blob: 463f8b264c6f558e8a175b3f5b5ce3cc04c89e2a [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* "Snapshot" name used when the image head (not a snapshot) is mapped */
#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Images are mapped read/write unless the "ro" option is given */
#define RBD_READ_ONLY_DEFAULT		false
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix for per-segment object names */
	u64 features;		/* feature bits (always 0 for v1 images) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids, refcounted */
	char *snap_names;	/* concatenated NUL-terminated snap names */
	u64 *snap_sizes;	/* image size at each snapshot */

	u64 obj_version;	/* version of the header object read */
};
101
struct rbd_options {
	bool read_only;		/* map the image read-only ("ro" option) */
};
105
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying libceph client */
	struct kref kref;		/* reference count */
	struct list_head node;		/* entry in rbd_client_list */
};
114
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* completion result */
	u64 bytes;	/* bytes transferred */
};
123
124/*
125 * a collection of requests
126 */
127struct rbd_req_coll {
128 int total;
129 int num_done;
130 struct kref kref;
131 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700132};
133
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* request length in bytes */
	int coll_index;			/* this request's slot in coll->status[] */
	struct rbd_req_coll *coll;	/* group this request belongs to */
};
145
/* In-memory record of one image snapshot; also a sysfs device */
struct rbd_snap {
	struct device dev;	/* sysfs device for this snapshot */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at snapshot time */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;			/* snapshot id */
	u64 features;		/* feature bits for this snapshot */
};
154
/*
 * What of the image is currently mapped: the image head
 * (snap_id == CEPH_NOSNAP) or a single named snapshot.
 */
struct rbd_mapping {
	char *snap_name;	/* mapped snap name, or RBD_SNAP_HEAD_NAME */
	u64 snap_id;		/* mapped snap id, or CEPH_NOSNAP for head */
	u64 size;		/* size of what is mapped */
	u64 features;		/* features of what is mapped */
	bool snap_exists;	/* set true when a snapshot is mapped;
				   presumably cleared if it disappears --
				   confirm at the update path */
	bool read_only;		/* mapping is read-only */
};
163
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	struct rbd_options rbd_opts;	/* options in effect for this map */
	struct rbd_client *rbd_client;	/* (possibly shared) ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory image metadata */
	char *image_id;		/* image id string */
	size_t image_id_len;
	char *image_name;	/* image name */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;	/* pool containing the image */
	int pool_id;

	struct ceph_osd_event *watch_event;	/* header watch event */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* what is currently mapped */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
205
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */
Alex Elder304f6802012-08-31 17:29:52 -0500214static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
215static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
216
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800217static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500218static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800219
Alex Elderf0f8cef2012-01-29 13:57:44 -0600220static ssize_t rbd_add(struct bus_type *bus, const char *buf,
221 size_t count);
222static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
223 size_t count);
224
/* Bus attributes: /sys/bus/rbd/add and /sys/bus/rbd/remove (root-write) */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
230
/* The "rbd" sysfs bus; parent of all rbd devices */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
235
/* Release callback for rbd_root_dev; it is static, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}
239
/* Root device all rbd devices hang off of in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
244
#ifdef RBD_DEBUG
/*
 * Verify an invariant; log the failed expression and BUG() if it
 * does not hold.
 *
 * Wrapped in do { } while (0) so the macro expands to a single
 * statement: the previous bare-"if" form silently captured the
 * "else" in "if (x) rbd_assert(y); else ...".
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800257
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
262
/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267
Alex Elder1fe5e992012-07-25 09:32:41 -0500268static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700269
/*
 * Block device open: refuse write opens of a read-only mapping,
 * otherwise pin the device and propagate the mapping's read-only
 * state to the block device.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
282
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800283static int rbd_release(struct gendisk *disk, fmode_t mode)
284{
285 struct rbd_device *rbd_dev = disk->private_data;
286
287 rbd_put_dev(rbd_dev);
288
289 return 0;
290}
291
/* Block-device entry points for rbd */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
297
298/*
299 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500300 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700301 */
Alex Elderf8c38922012-08-10 13:12:07 -0700302static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700303{
304 struct rbd_client *rbdc;
305 int ret = -ENOMEM;
306
307 dout("rbd_client_create\n");
308 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
309 if (!rbdc)
310 goto out_opt;
311
312 kref_init(&rbdc->kref);
313 INIT_LIST_HEAD(&rbdc->node);
314
Alex Elderbc534d862012-01-29 13:57:44 -0600315 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
316
Alex Elder43ae4702012-07-03 16:01:18 -0500317 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600319 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500320 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700321
322 ret = ceph_open_session(rbdc->client);
323 if (ret < 0)
324 goto out_err;
325
Alex Elder432b8582012-01-29 13:57:44 -0600326 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600328 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329
Alex Elderbc534d862012-01-29 13:57:44 -0600330 mutex_unlock(&ctl_mutex);
331
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332 dout("rbd_client_create created %p\n", rbdc);
333 return rbdc;
334
335out_err:
336 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600337out_mutex:
338 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700339 kfree(rbdc);
340out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500341 if (ceph_opts)
342 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400343 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344}
345
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	/* Clients created with -o noshare are never reused */
	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* take the reference while still under the lock */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
370
/*
 * mount options
 *
 * The Opt_last_* values are markers, not real tokens: they partition
 * the token space by argument type (int / string / Boolean), which
 * parse_rbd_opts_token() relies on.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
384
Alex Elder43ae4702012-07-03 16:01:18 -0500385static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700386 /* int args above */
387 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500388 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700389 {Opt_read_only, "ro"}, /* Alternate spelling */
390 {Opt_read_write, "read_write"},
391 {Opt_read_write, "rw"}, /* Alternate spelling */
392 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700393 {-1, NULL}
394};
395
/*
 * Parse one rbd-specific option token.  Invoked by
 * ceph_parse_options() for each option libceph itself does not
 * recognize; @private is the struct rbd_options being filled in.
 * Returns 0 on success or a negative errno.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Opt_last_* markers classify the token's argument type */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
436
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success the client is recorded in
 * rbd_dev->rbd_client; rbd_dev->rbd_opts is (re)initialized from the
 * option string as a side effect of parsing.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific options are handled by parse_rbd_opts_token() */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; it now owns no ceph_opts */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() consumes ceph_opts either way */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
469
/*
 * Destroy ceph client.  Called when the last reference is dropped.
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must NOT hold it.  (An earlier comment here said the
 * opposite; the locking moved into this function.)
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
487
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
497
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700498/*
499 * Destroy requests collection
500 */
501static void rbd_coll_release(struct kref *kref)
502{
503 struct rbd_req_coll *coll =
504 container_of(kref, struct rbd_req_coll, kref);
505
506 dout("rbd_coll_release %p\n", coll);
507 kfree(coll);
508}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700509
Alex Elder8e94af82012-07-25 09:32:40 -0500510static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
511{
Alex Elder103a1502012-08-02 11:29:45 -0500512 size_t size;
513 u32 snap_count;
514
515 /* The header has to start with the magic rbd header text */
516 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
517 return false;
518
519 /*
520 * The size of a snapshot header has to fit in a size_t, and
521 * that limits the number of snapshots.
522 */
523 snap_count = le32_to_cpu(ondisk->snap_count);
524 size = SIZE_MAX - sizeof (struct ceph_snap_context);
525 if (snap_count > size / sizeof (__le64))
526 return false;
527
528 /*
529 * Not only that, but the size of the entire the snapshot
530 * header must also be representable in a size_t.
531 */
532 size -= snap_count * sizeof (__le64);
533 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
534 return false;
535
536 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500537}
538
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700539/*
540 * Create a new header structure, translate header format from the on-disk
541 * header.
542 */
543static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500544 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545{
Alex Elderccece232012-07-10 20:30:10 -0500546 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500547 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500548 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500549 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700550
Alex Elder6a523252012-07-19 17:12:59 -0500551 memset(header, 0, sizeof (*header));
552
Alex Elder103a1502012-08-02 11:29:45 -0500553 snap_count = le32_to_cpu(ondisk->snap_count);
554
Alex Elder58c17b02012-08-23 23:22:06 -0500555 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
556 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500557 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700558 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500559 memcpy(header->object_prefix, ondisk->object_prefix, len);
560 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600561
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700562 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500563 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
564
Alex Elder621901d2012-08-23 23:22:06 -0500565 /* Save a copy of the snapshot names */
566
Alex Elderf785cc12012-08-23 23:22:06 -0500567 if (snap_names_len > (u64) SIZE_MAX)
568 return -EIO;
569 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700570 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500571 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500572 /*
573 * Note that rbd_dev_v1_header_read() guarantees
574 * the ondisk buffer we're working with has
575 * snap_names_len bytes beyond the end of the
576 * snapshot id array, this memcpy() is safe.
577 */
578 memcpy(header->snap_names, &ondisk->snaps[snap_count],
579 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500580
Alex Elder621901d2012-08-23 23:22:06 -0500581 /* Record each snapshot's size */
582
Alex Elderd2bb24e2012-07-26 23:37:14 -0500583 size = snap_count * sizeof (*header->snap_sizes);
584 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500586 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500587 for (i = 0; i < snap_count; i++)
588 header->snap_sizes[i] =
589 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 } else {
Alex Elderccece232012-07-10 20:30:10 -0500591 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 header->snap_names = NULL;
593 header->snap_sizes = NULL;
594 }
Alex Elder849b4262012-07-09 21:04:24 -0500595
Alex Elder34b13182012-07-13 20:35:12 -0500596 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 header->obj_order = ondisk->options.order;
598 header->crypt_type = ondisk->options.crypt_type;
599 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500600
Alex Elder621901d2012-08-23 23:22:06 -0500601 /* Allocate and fill in the snapshot context */
602
Alex Elderf84344f2012-08-31 17:29:51 -0500603 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500604 size = sizeof (struct ceph_snap_context);
605 size += snap_count * sizeof (header->snapc->snaps[0]);
606 header->snapc = kzalloc(size, GFP_KERNEL);
607 if (!header->snapc)
608 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609
610 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500611 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500613 for (i = 0; i < snap_count; i++)
614 header->snapc->snaps[i] =
615 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616
617 return 0;
618
Alex Elder6a523252012-07-19 17:12:59 -0500619out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500620 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500621 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500623 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500624 kfree(header->object_prefix);
625 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500626
Alex Elder00f1f362012-02-07 12:03:36 -0600627 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700628}
629
Alex Elder8836b992012-08-30 14:42:15 -0500630static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632
Alex Eldere86924a2012-07-10 20:30:11 -0500633 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600634
Alex Eldere86924a2012-07-10 20:30:11 -0500635 list_for_each_entry(snap, &rbd_dev->snaps, node) {
636 if (!strcmp(snap_name, snap->name)) {
637 rbd_dev->mapping.snap_id = snap->id;
638 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500639 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600640
Alex Eldere86924a2012-07-10 20:30:11 -0500641 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600642 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643 }
Alex Eldere86924a2012-07-10 20:30:11 -0500644
Alex Elder00f1f362012-02-07 12:03:36 -0600645 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646}
647
/*
 * Fill in rbd_dev->mapping for the given snapshot name.  Mapping the
 * image head (RBD_SNAP_HEAD_NAME) honors the read-only option;
 * mapping a named snapshot is always read-only.  On success
 * @snap_name is stored in the mapping (presumably the mapping then
 * owns it -- confirm at callers).
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the image head, not a snapshot */
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		rbd_dev->mapping.snap_exists = false;
		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
		ret = 0;
	} else {
		/* snap_by_name() fills in snap_id/size/features */
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->mapping.snap_name = snap_name;
done:
	return ret;
}
671
/*
 * Free all dynamically-allocated parts of an in-memory image header,
 * NULLing each pointer so a repeated call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* refcounted, may survive */
	header->snapc = NULL;
}
683
Alex Elder65ccfe22012-08-09 10:33:26 -0700684static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685{
Alex Elder65ccfe22012-08-09 10:33:26 -0700686 char *name;
687 u64 segment;
688 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700689
Alex Elder65ccfe22012-08-09 10:33:26 -0700690 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
691 if (!name)
692 return NULL;
693 segment = offset >> rbd_dev->header.obj_order;
694 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
695 rbd_dev->header.object_prefix, segment);
696 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
697 pr_err("error formatting segment name for #%llu (%d)\n",
698 segment, ret);
699 kfree(name);
700 name = NULL;
701 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700702
Alex Elder65ccfe22012-08-09 10:33:26 -0700703 return name;
704}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700705
Alex Elder65ccfe22012-08-09 10:33:26 -0700706static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
707{
708 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709
Alex Elder65ccfe22012-08-09 10:33:26 -0700710 return offset & (segment_size - 1);
711}
712
713static u64 rbd_segment_length(struct rbd_device *rbd_dev,
714 u64 offset, u64 length)
715{
716 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
717
718 offset &= segment_size - 1;
719
Alex Elderaafb2302012-09-06 16:00:54 -0500720 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700721 if (offset + length > segment_size)
722 length = segment_size - offset;
723
724 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700725}
726
/*
 * Number of segments spanned by the byte range [ofs, ofs + len).
 * Returns 0 for an empty range and -ERANGE if ofs + len would
 * overflow a u64.
 *
 * NOTE(review): the u64 segment difference is returned as an int --
 * presumably request sizes keep it small; confirm at callers.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	/* equivalent to "ofs + len - 1 would overflow" without the UB */
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
743
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700744/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700745 * returns the size of an object in the image
746 */
747static u64 rbd_obj_bytes(struct rbd_image_header *header)
748{
749 return 1 << header->obj_order;
750}
751
752/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753 * bio helpers
754 */
755
756static void bio_chain_put(struct bio *chain)
757{
758 struct bio *tmp;
759
760 while (chain) {
761 tmp = chain;
762 chain = chain->bi_next;
763 bio_put(tmp);
764 }
765}
766
767/*
768 * zeros a bio chain, starting at specific offset
769 */
770static void zero_bio_chain(struct bio *chain, int start_ofs)
771{
772 struct bio_vec *bv;
773 unsigned long flags;
774 void *buf;
775 int i;
776 int pos = 0;
777
778 while (chain) {
779 bio_for_each_segment(bv, chain, i) {
780 if (pos + bv->bv_len > start_ofs) {
781 int remainder = max(start_ofs - pos, 0);
782 buf = bvec_kmap_irq(bv, &flags);
783 memset(buf + remainder, 0,
784 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200785 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786 }
787 pos += bv->bv_len;
788 }
789
790 chain = chain->bi_next;
791 }
792}
793
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 *
 * Clones bios from *old until @len bytes have been covered.  On
 * return, *old points at the first un-cloned bio (so the caller can
 * continue from there), and *next points at where the next clone pass
 * should resume.  If @len ends in the middle of a bio, that bio is
 * split and *bp is set to the resulting bio_pair, which the caller
 * (or the next invocation of this function) must release.
 *
 * Returns the head of the new chain, or NULL on allocation/split
 * failure (any partial chain is freed before returning).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;
	int total = 0;

	/* release the pair left over from a previous split, if any */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		/* detach the clone from the device and the old chain */
		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	/* callers size their requests so the chain covers len exactly */
	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
868
869/*
870 * helpers for osd request op vectors.
871 */
Alex Elder57cfc102012-06-26 12:57:03 -0700872static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
873 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874{
Alex Elder57cfc102012-06-26 12:57:03 -0700875 struct ceph_osd_req_op *ops;
876
877 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
878 if (!ops)
879 return NULL;
880
881 ops[0].op = opcode;
882
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700883 /*
884 * op extent offset and length will be set later on
885 * in calc_raw_layout()
886 */
Alex Elder57cfc102012-06-26 12:57:03 -0700887 ops[0].payload_len = payload_len;
888
889 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700890}
891
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
896
/*
 * Record completion of one segment request (at @index) in the
 * collection for block request @rq, then complete, in order, every
 * consecutive finished segment starting from the first not-yet-
 * reported one.  This keeps blk_end_request calls in segment order
 * even though the OSD replies may arrive out of order.
 *
 * With no collection (single-segment request) the whole request is
 * completed directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes updates to the collection status array */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* extend [min, max) over the run of consecutively-done segments */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* drop the reference taken when the segment was issued */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
934
/* Complete the collection slot recorded in @req with the given result */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
940
/*
 * Send a ceph osd request.
 *
 * Builds and submits an OSD request for @object_name covering
 * [ofs, ofs + len).  Data is carried either by @bio or by @pages
 * (num_pages), never both.  If @rbd_cb is non-NULL the request is
 * asynchronous and the callback owns cleanup; otherwise the call
 * waits for completion and optionally returns the reassert version
 * through @ver.  If @linger_req is non-NULL the request is marked
 * lingering (used for watch) and returned through it.
 *
 * On failure the collection slot (coll/coll_index), if any, is
 * completed with the error before returning.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* report the failure through the collection, if any */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): strncpy() does not NUL-terminate if
	 * object_name fills r_oid completely, and the following
	 * strlen() would then read past the buffer.  Callers appear
	 * to pass names shorter than r_oid -- confirm.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per "stripe": unit == object size, count == 1 */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	/* synchronous mode: wait here and release the request ourselves */
	if (!rbd_cb) {
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1052
/*
 * Ceph osd op callback.
 *
 * Completion handler for asynchronous requests issued by rbd_do_op().
 * Parses the osd reply, normalizes read results (a missing object or
 * a short read is reported as success with the remainder of the bio
 * chain zero-filled), completes the collection slot, and releases the
 * request and its bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a hole: object doesn't exist, return zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the tail */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1092
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1097
/*
 * Do a synchronous ceph osd operation.
 *
 * Allocates a temporary page vector to receive reply data, submits
 * the request via rbd_do_request() with no callback (so it waits for
 * completion), and, for reads, copies up to @inbound_size bytes of
 * the reply into @inbound.  The page vector is always released before
 * returning.  Returns the number of bytes handled or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			     object_name, ofs, inbound_size, NULL,
			     pages, num_pages,
			     flags,
			     ops,
			     NULL, 0,
			     NULL,
			     linger_req, ver);
	if (ret < 0)
		goto done;

	/* ret is the number of bytes the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1141
/*
 * Do an asynchronous ceph osd operation.
 *
 * Issues a single read or write op against the one object (segment)
 * that contains [ofs, ofs + len).  The caller has already sized the
 * request so it never crosses a segment boundary (asserted below).
 * Completion is reported through rbd_req_cb() into the collection
 * slot (coll/coll_index).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry data in the op payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1194
/*
 * Request async osd write.
 *
 * Thin wrapper around rbd_do_op(): writes are always against the
 * head (CEPH_NOSNAP) with the given snapshot context, flagged
 * ONDISK so completion means the data is durable.
 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 ofs, len, bio, coll, coll_index);
}
1211
/*
 * Request async osd read.
 *
 * Thin wrapper around rbd_do_op(): reads carry no snapshot context
 * and target the given @snapid (CEPH_NOSNAP for the head).
 */
static int rbd_req_read(struct request *rq,
			struct rbd_device *rbd_dev,
			u64 snapid,
			u64 ofs, u64 len,
			struct bio *bio,
			struct rbd_req_coll *coll,
			int coll_index)
{
	return rbd_do_op(rq, rbd_dev, NULL,
			 snapid,
			 CEPH_OSD_OP_READ,
			 CEPH_OSD_FLAG_READ,
			 ofs, len, bio, coll, coll_index);
}
1229
/*
 * Request sync osd read.
 *
 * Synchronously reads @len bytes at @ofs of @object_name (at
 * snapshot @snapid) into @buf, optionally returning the object
 * version through @ver.  Returns bytes read or negative errno.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1255
/*
 * Acknowledge an osd notify.
 *
 * (The old comment said "Request sync osd watch" -- this actually
 * sends a NOTIFY_ACK for the notification identified by @notify_id,
 * telling the osd we have handled it.)  Fire-and-forget: completion
 * goes to rbd_simple_req_cb(), which just drops the request.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1285
/*
 * Callback invoked when the osd notifies us that the image header
 * object changed (e.g. a snapshot was taken or the image resized).
 * Refreshes the in-memory header, then acknowledges the notification
 * so the osd does not re-send it.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	/* ack even on refresh failure, so the notify isn't re-queued */
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1305
/*
 * Request sync osd watch.
 *
 * Registers a watch on the image header object so rbd_watch_cb() is
 * invoked when it changes.  Creates the osd event first, then issues
 * a lingering WATCH op (kept in rbd_dev->watch_request so it can be
 * re-sent after an osd reconnect).  On failure the event is torn
 * down before returning.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* flag == 1: register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1349
/*
 * Request sync osd unwatch.
 *
 * Unregisters the watch on the image header object (WATCH op with
 * flag == 0 and the original cookie) and cancels the associated osd
 * event.  The event is cancelled even if the unwatch op fails, since
 * we are tearing the device down either way.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* flag == 0: unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1379
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001380/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001381 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001382 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001383static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001384 const char *object_name,
1385 const char *class_name,
1386 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001387 const char *outbound,
1388 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001389 char *inbound,
1390 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001391 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001392 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001393{
1394 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001395 int class_name_len = strlen(class_name);
1396 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001397 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001398 int ret;
1399
Alex Elder3cb4a682012-06-26 12:57:03 -07001400 /*
1401 * Any input parameters required by the method we're calling
1402 * will be sent along with the class and method names as
1403 * part of the message payload. That data and its size are
1404 * supplied via the indata and indata_len fields (named from
1405 * the perspective of the server side) in the OSD request
1406 * operation.
1407 */
1408 payload_size = class_name_len + method_name_len + outbound_size;
1409 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001410 if (!ops)
1411 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001412
Alex Elderaded07e2012-07-03 16:01:18 -05001413 ops[0].cls.class_name = class_name;
1414 ops[0].cls.class_len = (__u8) class_name_len;
1415 ops[0].cls.method_name = method_name;
1416 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001417 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001418 ops[0].cls.indata = outbound;
1419 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001420
Alex Elder0ce1a792012-07-03 16:01:18 -05001421 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001422 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001423 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001424 object_name, 0, inbound_size, inbound,
1425 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001426
1427 rbd_destroy_ops(ops);
1428
1429 dout("cls_exec returned %d\n", ret);
1430 return ret;
1431}
1432
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001433static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1434{
1435 struct rbd_req_coll *coll =
1436 kzalloc(sizeof(struct rbd_req_coll) +
1437 sizeof(struct rbd_req_status) * num_reqs,
1438 GFP_ATOMIC);
1439
1440 if (!coll)
1441 return NULL;
1442 coll->total = num_reqs;
1443 kref_init(&coll->kref);
1444 return coll;
1445}
1446
/*
 * Block device queue callback.
 *
 * Entered with q->queue_lock held.  For each fetched request we drop
 * the queue lock while talking to the osd layer and reacquire it
 * before completing the request or fetching the next one -- the
 * lock/unlock pairing across this loop is deliberate and must be
 * preserved.  Each request is split at segment boundaries; the
 * resulting per-segment osd ops complete through a shared
 * rbd_req_coll so the block request is finished in order.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* header_rwsem guards the snapshot context and mapping */
		down_read(&rbd_dev->header_rwsem);

		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* hold a snapc reference for the duration of the request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			/* num_segs is 0 (empty) or a negative errno */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			/* one collection reference per segment issued */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the allocation reference; in-flight segments hold theirs */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1566
/*
 * A queue merge_bvec callback.  Ensures we don't build a bio that
 * spans more than one RADOS object (objects are 2^obj_order bytes).
 * One exception is a single-page bio, which is split later in
 * bio_chain_clone().
 *
 * Returns the number of additional bytes that may be added to the
 * bio without crossing the current object's boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* object size, in sectors */
	sector_t sector;		/* absolute starting sector of the bio */
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* Bytes left between the end of the bio and the object boundary */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1593
1594static void rbd_free_disk(struct rbd_device *rbd_dev)
1595{
1596 struct gendisk *disk = rbd_dev->disk;
1597
1598 if (!disk)
1599 return;
1600
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001601 if (disk->flags & GENHD_FL_UP)
1602 del_gendisk(disk);
1603 if (disk->queue)
1604 blk_cleanup_queue(disk->queue);
1605 put_disk(disk);
1606}
1607
1608/*
Alex Elder4156d992012-08-02 11:29:46 -05001609 * Read the complete header for the given rbd device.
1610 *
1611 * Returns a pointer to a dynamically-allocated buffer containing
1612 * the complete and validated header. Caller can pass the address
1613 * of a variable that will be filled in with the version of the
1614 * header object at the time it was read.
1615 *
1616 * Returns a pointer-coded errno if a failure occurs.
1617 */
1618static struct rbd_image_header_ondisk *
1619rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1620{
1621 struct rbd_image_header_ondisk *ondisk = NULL;
1622 u32 snap_count = 0;
1623 u64 names_size = 0;
1624 u32 want_count;
1625 int ret;
1626
1627 /*
1628 * The complete header will include an array of its 64-bit
1629 * snapshot ids, followed by the names of those snapshots as
1630 * a contiguous block of NUL-terminated strings. Note that
1631 * the number of snapshots could change by the time we read
1632 * it in, in which case we re-read it.
1633 */
1634 do {
1635 size_t size;
1636
1637 kfree(ondisk);
1638
1639 size = sizeof (*ondisk);
1640 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1641 size += names_size;
1642 ondisk = kmalloc(size, GFP_KERNEL);
1643 if (!ondisk)
1644 return ERR_PTR(-ENOMEM);
1645
1646 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1647 rbd_dev->header_name,
1648 0, size,
1649 (char *) ondisk, version);
1650
1651 if (ret < 0)
1652 goto out_err;
1653 if (WARN_ON((size_t) ret < size)) {
1654 ret = -ENXIO;
1655 pr_warning("short header read for image %s"
1656 " (want %zd got %d)\n",
1657 rbd_dev->image_name, size, ret);
1658 goto out_err;
1659 }
1660 if (!rbd_dev_ondisk_valid(ondisk)) {
1661 ret = -ENXIO;
1662 pr_warning("invalid header for image %s\n",
1663 rbd_dev->image_name);
1664 goto out_err;
1665 }
1666
1667 names_size = le64_to_cpu(ondisk->snap_names_len);
1668 want_count = snap_count;
1669 snap_count = le32_to_cpu(ondisk->snap_count);
1670 } while (snap_count != want_count);
1671
1672 return ondisk;
1673
1674out_err:
1675 kfree(ondisk);
1676
1677 return ERR_PTR(ret);
1678}
1679
1680/*
1681 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001682 */
1683static int rbd_read_header(struct rbd_device *rbd_dev,
1684 struct rbd_image_header *header)
1685{
Alex Elder4156d992012-08-02 11:29:46 -05001686 struct rbd_image_header_ondisk *ondisk;
1687 u64 ver = 0;
1688 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001689
Alex Elder4156d992012-08-02 11:29:46 -05001690 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1691 if (IS_ERR(ondisk))
1692 return PTR_ERR(ondisk);
1693 ret = rbd_header_from_disk(header, ondisk);
1694 if (ret >= 0)
1695 header->obj_version = ver;
1696 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001697
Alex Elder4156d992012-08-02 11:29:46 -05001698 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001699}
1700
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001701static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1702{
1703 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001704 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001705
Alex Eldera0593292012-07-19 09:09:27 -05001706 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001707 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001708}
1709
/*
 * Re-read the on-disk header and update the in-core copy under the
 * header rwsem.  Updates the mapped size (for a head mapping), swaps
 * in the new snapshot context/names/sizes, and re-syncs the snapshot
 * device list.  If @hver is non-NULL the new header object version
 * is returned through it.  Caller must hold ctl_mutex.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? (only relevant when the head, not a snapshot, is mapped) */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	/* Ownership of snapc/snap_names/snap_sizes transfers from h */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1761
Alex Elder1fe5e992012-07-25 09:32:41 -05001762static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1763{
1764 int ret;
1765
1766 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1767 ret = __rbd_refresh_header(rbd_dev, hver);
1768 mutex_unlock(&ctl_mutex);
1769
1770 return ret;
1771}
1772
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device and set its initial capacity from the current mapping.
 * Returns 0 on success, -ENOMEM if either allocation fails.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning objects (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1821
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001822/*
1823 sysfs
1824*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001825
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1830
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001831static ssize_t rbd_size_show(struct device *dev,
1832 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001833{
Alex Elder593a9e72012-02-07 12:03:37 -06001834 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001835 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001836
Josh Durgina51aa0c2011-12-05 10:35:04 -08001837 down_read(&rbd_dev->header_rwsem);
1838 size = get_capacity(rbd_dev->disk);
1839 up_read(&rbd_dev->header_rwsem);
1840
1841 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001842}
1843
Alex Elder34b13182012-07-13 20:35:12 -05001844/*
1845 * Note this shows the features for whatever's mapped, which is not
1846 * necessarily the base image.
1847 */
1848static ssize_t rbd_features_show(struct device *dev,
1849 struct device_attribute *attr, char *buf)
1850{
1851 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1852
1853 return sprintf(buf, "0x%016llx\n",
1854 (unsigned long long) rbd_dev->mapping.features);
1855}
1856
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001857static ssize_t rbd_major_show(struct device *dev,
1858 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859{
Alex Elder593a9e72012-02-07 12:03:37 -06001860 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001861
1862 return sprintf(buf, "%d\n", rbd_dev->major);
1863}
1864
1865static ssize_t rbd_client_id_show(struct device *dev,
1866 struct device_attribute *attr, char *buf)
1867{
Alex Elder593a9e72012-02-07 12:03:37 -06001868 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001869
Alex Elder1dbb4392012-01-24 10:08:37 -06001870 return sprintf(buf, "client%lld\n",
1871 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001872}
1873
1874static ssize_t rbd_pool_show(struct device *dev,
1875 struct device_attribute *attr, char *buf)
1876{
Alex Elder593a9e72012-02-07 12:03:37 -06001877 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001878
1879 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1880}
1881
Alex Elder9bb2f332012-07-12 10:46:35 -05001882static ssize_t rbd_pool_id_show(struct device *dev,
1883 struct device_attribute *attr, char *buf)
1884{
1885 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1886
1887 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1888}
1889
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890static ssize_t rbd_name_show(struct device *dev,
1891 struct device_attribute *attr, char *buf)
1892{
Alex Elder593a9e72012-02-07 12:03:37 -06001893 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001894
Alex Elder0bed54d2012-07-03 16:01:18 -05001895 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001896}
1897
Alex Elder589d30e2012-07-10 20:30:11 -05001898static ssize_t rbd_image_id_show(struct device *dev,
1899 struct device_attribute *attr, char *buf)
1900{
1901 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1902
1903 return sprintf(buf, "%s\n", rbd_dev->image_id);
1904}
1905
Alex Elder34b13182012-07-13 20:35:12 -05001906/*
1907 * Shows the name of the currently-mapped snapshot (or
1908 * RBD_SNAP_HEAD_NAME for the base image).
1909 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001910static ssize_t rbd_snap_show(struct device *dev,
1911 struct device_attribute *attr,
1912 char *buf)
1913{
Alex Elder593a9e72012-02-07 12:03:37 -06001914 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001915
Alex Elderf84344f2012-08-31 17:29:51 -05001916 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917}
1918
1919static ssize_t rbd_image_refresh(struct device *dev,
1920 struct device_attribute *attr,
1921 const char *buf,
1922 size_t size)
1923{
Alex Elder593a9e72012-02-07 12:03:37 -06001924 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001925 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001926
Alex Elder1fe5e992012-07-25 09:32:41 -05001927 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001928
1929 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001930}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001931
/* Per-device sysfs attributes (documented in sysfs-bus-rbd) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);	/* write-only */
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* No-op release: rbd_device lifetime is managed elsewhere */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1975
1976
1977/*
1978 sysfs - snapshots
1979*/
1980
1981static ssize_t rbd_snap_size_show(struct device *dev,
1982 struct device_attribute *attr,
1983 char *buf)
1984{
1985 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1986
Josh Durgin35915382011-12-05 18:25:13 -08001987 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001988}
1989
1990static ssize_t rbd_snap_id_show(struct device *dev,
1991 struct device_attribute *attr,
1992 char *buf)
1993{
1994 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1995
Josh Durgin35915382011-12-05 18:25:13 -08001996 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001997}
1998
Alex Elder34b13182012-07-13 20:35:12 -05001999static ssize_t rbd_snap_features_show(struct device *dev,
2000 struct device_attribute *attr,
2001 char *buf)
2002{
2003 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2004
2005 return sprintf(buf, "0x%016llx\n",
2006 (unsigned long long) snap->features);
2007}
2008
/* Per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Device-model release callback: frees the rbd_snap when the last
 * reference to its embedded struct device is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2040
Alex Elder304f6802012-08-31 17:29:52 -05002041static bool rbd_snap_registered(struct rbd_snap *snap)
2042{
2043 bool ret = snap->dev.type == &rbd_snap_device_type;
2044 bool reg = device_is_registered(&snap->dev);
2045
2046 rbd_assert(!ret ^ reg);
2047
2048 return ret;
2049}
2050
/*
 * Unlink a snapshot from the device's snapshot list and, if its
 * sysfs device was registered, unregister it.  Unregistering may
 * drop the final device reference and free the rbd_snap via
 * rbd_snap_dev_release().
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2057
Alex Elder14e70852012-07-19 09:09:27 -05002058static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002059 struct device *parent)
2060{
2061 struct device *dev = &snap->dev;
2062 int ret;
2063
2064 dev->type = &rbd_snap_device_type;
2065 dev->parent = parent;
2066 dev->release = rbd_snap_dev_release;
2067 dev_set_name(dev, "snap_%s", snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002068 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2069
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002070 ret = device_register(dev);
2071
2072 return ret;
2073}
2074
Alex Elder4e891e02012-07-10 20:30:10 -05002075static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002076 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002077 u64 snap_id, u64 snap_size,
2078 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002079{
Alex Elder4e891e02012-07-10 20:30:10 -05002080 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002081 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002082
2083 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002084 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002085 return ERR_PTR(-ENOMEM);
2086
2087 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002088 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002089 if (!snap->name)
2090 goto err;
2091
Alex Elderc8d18422012-07-10 20:30:11 -05002092 snap->id = snap_id;
2093 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002094 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002095
2096 return snap;
2097
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002098err:
2099 kfree(snap->name);
2100 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002101
2102 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002103}
2104
2105/*
Alex Elder35938152012-08-02 11:29:46 -05002106 * Scan the rbd device's current snapshot list and compare it to the
2107 * newly-received snapshot context. Remove any existing snapshots
2108 * not present in the new snapshot context. Add a new snapshot for
2109 * any snaphots in the snapshot context not in the current list.
2110 * And verify there are no changes to snapshots we already know
2111 * about.
2112 *
2113 * Assumes the snapshots in the snapshot context are sorted by
2114 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2115 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002116 */
Alex Elder304f6802012-08-31 17:29:52 -05002117static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002118{
Alex Elder35938152012-08-02 11:29:46 -05002119 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2120 const u32 snap_count = snapc->num_snaps;
2121 char *snap_name = rbd_dev->header.snap_names;
2122 struct list_head *head = &rbd_dev->snaps;
2123 struct list_head *links = head->next;
2124 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002125
Alex Elder9fcbb802012-08-23 23:48:49 -05002126 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002127 while (index < snap_count || links != head) {
2128 u64 snap_id;
2129 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002130
Alex Elder35938152012-08-02 11:29:46 -05002131 snap_id = index < snap_count ? snapc->snaps[index]
2132 : CEPH_NOSNAP;
2133 snap = links != head ? list_entry(links, struct rbd_snap, node)
2134 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002135 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002136
Alex Elder35938152012-08-02 11:29:46 -05002137 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2138 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002139
Alex Elder35938152012-08-02 11:29:46 -05002140 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002141
Alex Elderf84344f2012-08-31 17:29:51 -05002142 if (rbd_dev->mapping.snap_id == snap->id)
2143 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002144 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002145 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002146 rbd_dev->mapping.snap_id == snap->id ?
2147 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002148 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002149
Alex Elder35938152012-08-02 11:29:46 -05002150 /* Done with this list entry; advance */
2151
2152 links = next;
2153 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002154 }
Alex Elder35938152012-08-02 11:29:46 -05002155
Alex Elder9fcbb802012-08-23 23:48:49 -05002156 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2157 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002158 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
Alex Elderc8d18422012-07-10 20:30:11 -05002159 struct rbd_image_header *header = &rbd_dev->header;
Alex Elder35938152012-08-02 11:29:46 -05002160 struct rbd_snap *new_snap;
2161
2162 /* We haven't seen this snapshot before */
2163
Alex Elderc8d18422012-07-10 20:30:11 -05002164 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002165 snap_id, header->snap_sizes[index], 0);
Alex Elder9fcbb802012-08-23 23:48:49 -05002166 if (IS_ERR(new_snap)) {
2167 int err = PTR_ERR(new_snap);
2168
2169 dout(" failed to add dev, error %d\n", err);
2170
2171 return err;
2172 }
Alex Elder35938152012-08-02 11:29:46 -05002173
2174 /* New goes before existing, or at end of list */
2175
Alex Elder9fcbb802012-08-23 23:48:49 -05002176 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002177 if (snap)
2178 list_add_tail(&new_snap->node, &snap->node);
2179 else
Alex Elder523f3252012-08-30 00:16:37 -05002180 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002181 } else {
2182 /* Already have this one */
2183
Alex Elder9fcbb802012-08-23 23:48:49 -05002184 dout(" already present\n");
2185
Alex Elderaafb2302012-09-06 16:00:54 -05002186 rbd_assert(snap->size ==
2187 rbd_dev->header.snap_sizes[index]);
2188 rbd_assert(!strcmp(snap->name, snap_name));
Alex Elder35938152012-08-02 11:29:46 -05002189
2190 /* Done with this list entry; advance */
2191
2192 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002193 }
Alex Elder35938152012-08-02 11:29:46 -05002194
2195 /* Advance to the next entry in the snapshot context */
2196
2197 index++;
2198 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002199 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002200 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002201
2202 return 0;
2203}
2204
Alex Elder304f6802012-08-31 17:29:52 -05002205/*
2206 * Scan the list of snapshots and register the devices for any that
2207 * have not already been registered.
2208 */
2209static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2210{
2211 struct rbd_snap *snap;
2212 int ret = 0;
2213
2214 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002215 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2216 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002217
2218 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2219 if (!rbd_snap_registered(snap)) {
2220 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2221 if (ret < 0)
2222 break;
2223 }
2224 }
2225 dout("%s: returning %d\n", __func__, ret);
2226
2227 return ret;
2228}
2229
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002230static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2231{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002232 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002233 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002234
2235 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002236
Alex Eldercd789ab2012-08-30 00:16:38 -05002237 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002238 dev->bus = &rbd_bus_type;
2239 dev->type = &rbd_device_type;
2240 dev->parent = &rbd_root_dev;
2241 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002242 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002243 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002244
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002245 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002246
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002247 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002248}
2249
/* Undo rbd_bus_add_dev(): remove the rbd device from sysfs. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2254
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002255static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2256{
2257 int ret, rc;
2258
2259 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002260 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002261 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002262 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002263 if (rc < 0)
2264 return rc;
2265 }
2266 } while (ret == -ERANGE);
2267
2268 return ret;
2269}
2270
/* Highest device id handed out so far; ids start at 1. */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Atomic increment makes the id unique without holding the lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002287
Alex Elder1ddbe942012-01-29 13:57:44 -06002288/*
Alex Elder499afd52012-02-02 08:13:29 -06002289 * Remove an rbd_dev from the global list, and record that its
2290 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002291 */
Alex Eldere2839302012-08-29 17:11:06 -05002292static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002293{
Alex Elderd184f6b2012-01-29 13:57:44 -06002294 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002295 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002296 int max_id;
2297
Alex Elderaafb2302012-09-06 16:00:54 -05002298 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002299
Alex Eldere2839302012-08-29 17:11:06 -05002300 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2301 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002302 spin_lock(&rbd_dev_list_lock);
2303 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002304
2305 /*
2306 * If the id being "put" is not the current maximum, there
2307 * is nothing special we need to do.
2308 */
Alex Eldere2839302012-08-29 17:11:06 -05002309 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002310 spin_unlock(&rbd_dev_list_lock);
2311 return;
2312 }
2313
2314 /*
2315 * We need to update the current maximum id. Search the
2316 * list to find out what it is. We're more likely to find
2317 * the maximum at the end, so search the list backward.
2318 */
2319 max_id = 0;
2320 list_for_each_prev(tmp, &rbd_dev_list) {
2321 struct rbd_device *rbd_dev;
2322
2323 rbd_dev = list_entry(tmp, struct rbd_device, node);
2324 if (rbd_id > max_id)
2325 max_id = rbd_id;
2326 }
Alex Elder499afd52012-02-02 08:13:29 -06002327 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002328
Alex Elder1ddbe942012-01-29 13:57:44 -06002329 /*
Alex Eldere2839302012-08-29 17:11:06 -05002330 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002331 * which case it now accurately reflects the new maximum.
2332 * Be careful not to overwrite the maximum value in that
2333 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002334 */
Alex Eldere2839302012-08-29 17:11:06 -05002335 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2336 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002337}
2338
/*
 * Advance *buf past any leading white space, leaving it pointing at
 * the first non-space character (if any), and return the length of
 * the token (maximal run of non-space characters) found there.
 * *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, spaces);	/* skip leading white space */
	*buf = start;

	return strcspn(start, spaces);		/* length of token found */
}
2357
/*
 * Find the next token in *buf and, if it fits in the caller's
 * buffer (token_size bytes), copy it there '\0'-terminated.  *buf
 * must be '\0'-terminated on entry and is advanced past the token
 * even when the token is too big to copy.
 *
 * Returns the token length (not counting the '\0'): 0 if no token
 * was found, >= token_size if the token would not fit.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2387
2388/*
Alex Elderea3352f2012-07-09 21:04:23 -05002389 * Finds the next token in *buf, dynamically allocates a buffer big
2390 * enough to hold a copy of it, and copies the token into the new
2391 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2392 * that a duplicate buffer is created even for a zero-length token.
2393 *
2394 * Returns a pointer to the newly-allocated duplicate, or a null
2395 * pointer if memory for the duplicate was not available. If
2396 * the lenp argument is a non-null pointer, the length of the token
2397 * (not including the '\0') is returned in *lenp.
2398 *
2399 * If successful, the *buf pointer will be updated to point beyond
2400 * the end of the found token.
2401 *
2402 * Note: uses GFP_KERNEL for allocation.
2403 */
2404static inline char *dup_token(const char **buf, size_t *lenp)
2405{
2406 char *dup;
2407 size_t len;
2408
2409 len = next_token(buf);
2410 dup = kmalloc(len + 1, GFP_KERNEL);
2411 if (!dup)
2412 return NULL;
2413
2414 memcpy(dup, *buf, len);
2415 *(dup + len) = '\0';
2416 *buf += len;
2417
2418 if (lenp)
2419 *lenp = len;
2420
2421 return dup;
2422}
2423
/*
 * Parse the argument string written to /sys/bus/rbd/add.  Fills in
 * the pool_name, image_name, and image_name_len fields of the given
 * rbd_dev, records where the monitor address list lives in the
 * caller's buffer (*mon_addrs points INTO buf -- it is not copied),
 * and copies the options token into the caller-supplied options
 * buffer.  Returns a pointer to a dynamically-allocated copy of the
 * snapshot name to map if successful, or a pointer-coded error
 * otherwise (-EINVAL for a missing/oversized required token,
 * -ENOMEM for an allocation failure).
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/*
	 * The first four tokens are required: monitor addresses,
	 * options, pool name, and image name.
	 */

	len = next_token(&buf);
	if (!len)
		return err_ptr;
	/* +1 accounts for the '\0' the caller will need room for */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;	/* not duplicated; points into buf */

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* From here on, failures are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default maps the image head */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo any field assignments so rbd_dev is left zero-filled */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2493
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof includes the '\0', which covers the terminator */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the class method that maps an image name to its id */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Decode the (length-prefixed) id string into image_id */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		/* Leave image_id NULL on failure, per the contract above */
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2562
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002563static ssize_t rbd_add(struct bus_type *bus,
2564 const char *buf,
2565 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002566{
Alex Eldercb8627c2012-07-09 21:04:23 -05002567 char *options;
2568 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002569 const char *mon_addrs = NULL;
2570 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002571 struct ceph_osd_client *osdc;
2572 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002573 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002574
2575 if (!try_module_get(THIS_MODULE))
2576 return -ENODEV;
2577
Alex Elder27cc2592012-02-02 08:13:30 -06002578 options = kmalloc(count, GFP_KERNEL);
2579 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05002580 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002581 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2582 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05002583 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002584
2585 /* static rbd_device initialization */
2586 spin_lock_init(&rbd_dev->lock);
2587 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002588 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002589 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002590
Alex Eldera725f65e2012-02-02 08:13:30 -06002591 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002592 snap_name = rbd_add_parse_args(rbd_dev, buf,
2593 &mon_addrs, &mon_addrs_size, options, count);
2594 if (IS_ERR(snap_name)) {
2595 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05002596 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05002597 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002598
Alex Elderf8c38922012-08-10 13:12:07 -07002599 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2600 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002601 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002602
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002603 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002604 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002605 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2606 if (rc < 0)
2607 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002608 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002609
Alex Elder589d30e2012-07-10 20:30:11 -05002610 rc = rbd_dev_image_id(rbd_dev);
2611 if (!rc) {
2612 rc = -ENOTSUPP; /* Not actually supporting format 2 yet */
2613 goto err_out_client;
2614 }
2615
2616 /* Version 1 images have no id; empty string is used */
2617
2618 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2619 if (!rbd_dev->image_id) {
2620 rc = -ENOMEM;
2621 goto err_out_client;
2622 }
2623 rbd_dev->image_id_len = 0;
2624
Alex Elder3fcf2582012-07-03 16:01:19 -05002625 /* Create the name of the header object */
2626
2627 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2628 + sizeof (RBD_SUFFIX),
2629 GFP_KERNEL);
2630 if (!rbd_dev->header_name)
2631 goto err_out_client;
2632 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2633
Alex Elder05fd6f62012-08-29 17:11:07 -05002634 /* Get information about the image being mapped */
2635
2636 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
2637 if (rc)
2638 goto err_out_client;
2639
2640 /* no need to lock here, as rbd_dev is not registered yet */
2641 rc = rbd_dev_snaps_update(rbd_dev);
2642 if (rc)
2643 goto err_out_header;
2644
2645 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2646 if (rc)
2647 goto err_out_header;
2648
Alex Elder85ae8922012-07-26 23:37:14 -05002649 /* generate unique id: find highest unique id, add one */
2650 rbd_dev_id_get(rbd_dev);
2651
2652 /* Fill in the device name, now that we have its id. */
2653 BUILD_BUG_ON(DEV_NAME_LEN
2654 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2655 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2656
2657 /* Get our block major device number. */
2658
Alex Elder27cc2592012-02-02 08:13:30 -06002659 rc = register_blkdev(0, rbd_dev->name);
2660 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002661 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06002662 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002663
Alex Elder0f308a32012-08-29 17:11:07 -05002664 /* Set up the blkdev mapping. */
2665
2666 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002667 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002668 goto err_out_blkdev;
2669
Alex Elder0f308a32012-08-29 17:11:07 -05002670 rc = rbd_bus_add_dev(rbd_dev);
2671 if (rc)
2672 goto err_out_disk;
2673
Alex Elder32eec682012-02-08 16:11:14 -06002674 /*
2675 * At this point cleanup in the event of an error is the job
2676 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06002677 */
Alex Elder2ac4e752012-07-10 20:30:10 -05002678
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002679 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05002680 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002681 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002682 if (rc)
2683 goto err_out_bus;
2684
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002685 rc = rbd_init_watch_dev(rbd_dev);
2686 if (rc)
2687 goto err_out_bus;
2688
Alex Elder3ee40012012-08-29 17:11:07 -05002689 /* Everything's ready. Announce the disk to the world. */
2690
2691 add_disk(rbd_dev->disk);
2692
2693 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2694 (unsigned long long) rbd_dev->mapping.size);
2695
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002696 return count;
2697
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002698err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002699 /* this will also clean up rest of rbd_dev stuff */
2700
2701 rbd_bus_del_dev(rbd_dev);
2702 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002703 return rc;
2704
Alex Elder0f308a32012-08-29 17:11:07 -05002705err_out_disk:
2706 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002707err_out_blkdev:
2708 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05002709err_out_id:
2710 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05002711err_out_header:
2712 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002713err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05002714 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002715 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05002716 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05002717err_out_args:
2718 kfree(rbd_dev->mapping.snap_name);
2719 kfree(rbd_dev->image_name);
2720 kfree(rbd_dev->pool_name);
2721err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06002722 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002723 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002724
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002725 dout("Error adding device %s\n", buf);
2726 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002727
2728 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002729}
2730
Alex Elderde71a292012-07-03 16:01:19 -05002731static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002732{
2733 struct list_head *tmp;
2734 struct rbd_device *rbd_dev;
2735
Alex Eldere124a822012-01-29 13:57:44 -06002736 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002737 list_for_each(tmp, &rbd_dev_list) {
2738 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002739 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002740 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002741 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002742 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002743 }
Alex Eldere124a822012-01-29 13:57:44 -06002744 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002745 return NULL;
2746}
2747
/*
 * Final teardown of an rbd_device -- presumably the struct device
 * ->release callback (TODO confirm where it is wired up; not visible
 * here).  Stops watching the header object, drops the ceph client,
 * releases the disk and block major, frees all allocated fields,
 * returns the device id, and finally drops the module reference
 * taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one is outstanding */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2782
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002783static ssize_t rbd_remove(struct bus_type *bus,
2784 const char *buf,
2785 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002786{
2787 struct rbd_device *rbd_dev = NULL;
2788 int target_id, rc;
2789 unsigned long ul;
2790 int ret = count;
2791
2792 rc = strict_strtoul(buf, 10, &ul);
2793 if (rc)
2794 return rc;
2795
2796 /* convert to int; abort if we lost anything in the conversion */
2797 target_id = (int) ul;
2798 if (target_id != ul)
2799 return -EINVAL;
2800
2801 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2802
2803 rbd_dev = __rbd_get_dev(target_id);
2804 if (!rbd_dev) {
2805 ret = -ENOENT;
2806 goto done;
2807 }
2808
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002809 __rbd_remove_all_snaps(rbd_dev);
2810 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002811
2812done:
2813 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05002814
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002815 return ret;
2816}
2817
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002818/*
2819 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002820 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002821 */
2822static int rbd_sysfs_init(void)
2823{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002824 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002825
Alex Elderfed4c142012-02-07 12:03:36 -06002826 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002827 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002828 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002829
Alex Elderfed4c142012-02-07 12:03:36 -06002830 ret = bus_register(&rbd_bus_type);
2831 if (ret < 0)
2832 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002833
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002834 return ret;
2835}
2836
/* Tear down the sysfs interface, in reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2842
2843int __init rbd_init(void)
2844{
2845 int rc;
2846
2847 rc = rbd_sysfs_init();
2848 if (rc)
2849 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002850 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002851 return 0;
2852}
2853
/* Module exit point: remove the sysfs interface */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2858
/* Module registration and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");