blob: eb6b7723906bf0e3de2e0a953068aa3ffce08a69 [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
Alex Elder81a89792012-02-02 08:13:30 -060069/*
70 * An RBD device name will be "rbd#", where the "rbd" comes from
71 * RBD_DRV_NAME above, and # is a unique integer identifier.
72 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
73 * enough to hold all possible device names.
74 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070075#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060076#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070077
Alex Eldercc0538b2012-08-10 13:12:07 -070078#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070079
/*
 * block device image metadata (in-memory version)
 *
 * Built from the on-disk header by rbd_header_from_disk() and torn
 * down by rbd_header_free().
 */
struct rbd_image_header {
	u64 image_size;		/* image size, in bytes */
	char *object_prefix;	/* name prefix shared by the data objects */
	__u8 obj_order;		/* objects are (1 << obj_order) bytes */
	__u8 crypt_type;	/* copied from the on-disk header */
	__u8 comp_type;		/* copied from the on-disk header */
	struct ceph_snap_context *snapc;	/* snapshot ids; refcounted */

	char *snap_names;	/* all snapshot names, NUL-separated */
	u64 *snap_sizes;	/* image size as of each snapshot */

	u64 obj_version;	/* header version (see rbd_refresh_header()) */
};
96
/* Per-mapping options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;		/* map the image read-only */
};
100
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* the underlying ceph client */
	struct kref kref;		/* reference count */
	struct list_head node;		/* entry on rbd_client_list */
};
109
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* byte count reported at completion */
};
118
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;		/* number of entries in status[] */
	int num_done;		/* how many have completed so far */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per request */
};
128
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this piece of I/O */
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this request is part of */
};
140
/* In-core record of one image snapshot; also has a sysfs presence */
struct rbd_snap {
	struct device dev;	/* sysfs device (released via rbd_dev_release?) */
	const char *name;	/* snapshot name */
	u64 size;		/* image size as of this snapshot */
	struct list_head node;	/* entry on rbd_device->snaps */
	u64 id;			/* snapshot id */
};
148
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	struct rbd_options rbd_opts;	/* options given when mapped */
	struct rbd_client *rbd_client;	/* shared, refcounted ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;	/* in-memory image header */
	char *image_name;	/* name of the mapped rbd image */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;	/* pool the image lives in */
	int pool_id;

	struct ceph_osd_event *watch_event;	/* osd watch/notify state */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	bool read_only;		/* no writes allowed through this mapping */

	struct list_head node;	/* entry on the global rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
193
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700194static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600195
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700196static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600197static DEFINE_SPINLOCK(rbd_dev_list_lock);
198
Alex Elder432b8582012-01-29 13:57:44 -0600199static LIST_HEAD(rbd_client_list); /* clients */
200static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700201
Alex Elder9fcbb802012-08-23 23:48:49 -0500202static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800203static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800204static ssize_t rbd_snap_add(struct device *dev,
205 struct device_attribute *attr,
206 const char *buf,
207 size_t count);
Alex Elder14e70852012-07-19 09:09:27 -0500208static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800209
Alex Elderf0f8cef2012-01-29 13:57:44 -0600210static ssize_t rbd_add(struct bus_type *bus, const char *buf,
211 size_t count);
212static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
213 size_t count);
214
/*
 * Sysfs bus interface: /sys/bus/rbd exposes write-only "add" and
 * "remove" control files, handled by rbd_add() and rbd_remove().
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
225
/*
 * Parent device of all rbd devices.  rbd_root_dev is statically
 * allocated and never freed, so its release callback is intentionally
 * empty (it exists only to satisfy the device core).
 */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
234
Alex Elderaafb2302012-09-06 16:00:54 -0500235#ifdef RBD_DEBUG
236#define rbd_assert(expr) \
237 if (unlikely(!(expr))) { \
238 printk(KERN_ERR "\nAssertion failure in %s() " \
239 "at line %d:\n\n" \
240 "\trbd_assert(%s);\n\n", \
241 __func__, __LINE__, #expr); \
242 BUG(); \
243 }
244#else /* !RBD_DEBUG */
245# define rbd_assert(expr) ((void) 0)
246#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800247
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700257
Alex Elder1fe5e992012-07-25 09:32:41 -0500258static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700259
/*
 * Open the rbd block device.  Refuses writable opens of read-only
 * mappings and pins the device while it is open.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	/* propagate our read-only setting to the block layer */
	set_device_ro(bdev, rbd_dev->read_only);

	return 0;
}
272
/* Last close of the device: drop the reference taken in rbd_open(). */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
281
/* Block device operations: rbd only needs open and release hooks. */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
287
288/*
289 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500290 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700291 */
Alex Elderf8c38922012-08-10 13:12:07 -0700292static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700293{
294 struct rbd_client *rbdc;
295 int ret = -ENOMEM;
296
297 dout("rbd_client_create\n");
298 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
299 if (!rbdc)
300 goto out_opt;
301
302 kref_init(&rbdc->kref);
303 INIT_LIST_HEAD(&rbdc->node);
304
Alex Elderbc534d862012-01-29 13:57:44 -0600305 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
306
Alex Elder43ae4702012-07-03 16:01:18 -0500307 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700308 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600309 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500310 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700311
312 ret = ceph_open_session(rbdc->client);
313 if (ret < 0)
314 goto out_err;
315
Alex Elder432b8582012-01-29 13:57:44 -0600316 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600318 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700319
Alex Elderbc534d862012-01-29 13:57:44 -0600320 mutex_unlock(&ctl_mutex);
321
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322 dout("rbd_client_create created %p\n", rbdc);
323 return rbdc;
324
325out_err:
326 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600327out_mutex:
328 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329 kfree(rbdc);
330out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500331 if (ceph_opts)
332 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400333 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334}
335
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 *
 * Returns NULL when sharing is disabled (CEPH_OPT_NOSHARE) or when no
 * matching client exists; the caller then creates a fresh one.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		/* ceph_compare_options() returns 0 on a match */
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
360
361/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700362 * mount options
363 */
enum {
	/* tokens taking an int argument would be listed before this marker */
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

/* Token table for match_token(); ordering mirrors the enum above */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
385
/*
 * Parse one rbd-specific option token; callback for
 * ceph_parse_options().  @private is the struct rbd_options to fill
 * in.  Returns 0 on success, -EINVAL for an unknown token, or a
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The Opt_last_* markers classify the token's argument type */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
426
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * On success rbd_dev->rbd_client holds a reference (shared or newly
 * created) and rbd_dev->rbd_opts has been filled in from @options.
 * Returns 0 or a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* parse_rbd_opts_token() consumes the rbd-specific options as a
	 * side effect; the rest become the ceph client's options */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
459
/*
 * Destroy ceph client
 *
 * kref release callback.  Unlinks the client from rbd_client_list —
 * taking rbd_client_list_lock itself, so the caller must NOT hold it
 * (the original comment claiming otherwise was stale) — then tears
 * down the underlying ceph client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
477
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against use after put */
}
487
/*
 * Destroy requests collection
 *
 * kref release callback for a struct rbd_req_coll.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700499
Alex Elder8e94af82012-07-25 09:32:40 -0500500static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
501{
Alex Elder103a1502012-08-02 11:29:45 -0500502 size_t size;
503 u32 snap_count;
504
505 /* The header has to start with the magic rbd header text */
506 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
507 return false;
508
509 /*
510 * The size of a snapshot header has to fit in a size_t, and
511 * that limits the number of snapshots.
512 */
513 snap_count = le32_to_cpu(ondisk->snap_count);
514 size = SIZE_MAX - sizeof (struct ceph_snap_context);
515 if (snap_count > size / sizeof (__le64))
516 return false;
517
518 /*
519 * Not only that, but the size of the entire the snapshot
520 * header must also be representable in a size_t.
521 */
522 size -= snap_count * sizeof (__le64);
523 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
524 return false;
525
526 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500527}
528
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700529/*
530 * Create a new header structure, translate header format from the on-disk
531 * header.
532 */
533static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500534 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700535{
Alex Elderccece232012-07-10 20:30:10 -0500536 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500537 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500538 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500539 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540
Alex Elder6a523252012-07-19 17:12:59 -0500541 memset(header, 0, sizeof (*header));
542
Alex Elder103a1502012-08-02 11:29:45 -0500543 snap_count = le32_to_cpu(ondisk->snap_count);
544
Alex Elder58c17b02012-08-23 23:22:06 -0500545 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
546 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500547 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700548 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500549 memcpy(header->object_prefix, ondisk->object_prefix, len);
550 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600551
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700552 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500553 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
554
Alex Elder621901d2012-08-23 23:22:06 -0500555 /* Save a copy of the snapshot names */
556
Alex Elderf785cc12012-08-23 23:22:06 -0500557 if (snap_names_len > (u64) SIZE_MAX)
558 return -EIO;
559 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700560 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500561 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500562 /*
563 * Note that rbd_dev_v1_header_read() guarantees
564 * the ondisk buffer we're working with has
565 * snap_names_len bytes beyond the end of the
566 * snapshot id array, this memcpy() is safe.
567 */
568 memcpy(header->snap_names, &ondisk->snaps[snap_count],
569 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500570
Alex Elder621901d2012-08-23 23:22:06 -0500571 /* Record each snapshot's size */
572
Alex Elderd2bb24e2012-07-26 23:37:14 -0500573 size = snap_count * sizeof (*header->snap_sizes);
574 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500576 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500577 for (i = 0; i < snap_count; i++)
578 header->snap_sizes[i] =
579 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700580 } else {
Alex Elderccece232012-07-10 20:30:10 -0500581 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582 header->snap_names = NULL;
583 header->snap_sizes = NULL;
584 }
Alex Elder849b4262012-07-09 21:04:24 -0500585
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586 header->image_size = le64_to_cpu(ondisk->image_size);
587 header->obj_order = ondisk->options.order;
588 header->crypt_type = ondisk->options.crypt_type;
589 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500590
Alex Elder621901d2012-08-23 23:22:06 -0500591 /* Allocate and fill in the snapshot context */
592
Alex Elder6a523252012-07-19 17:12:59 -0500593 size = sizeof (struct ceph_snap_context);
594 size += snap_count * sizeof (header->snapc->snaps[0]);
595 header->snapc = kzalloc(size, GFP_KERNEL);
596 if (!header->snapc)
597 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700598
599 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500600 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700601 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500602 for (i = 0; i < snap_count; i++)
603 header->snapc->snaps[i] =
604 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605
606 return 0;
607
Alex Elder6a523252012-07-19 17:12:59 -0500608out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500609 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500610 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700611 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500612 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500613 kfree(header->object_prefix);
614 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500615
Alex Elder00f1f362012-02-07 12:03:36 -0600616 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617}
618
/*
 * Look up the snapshot named @snap_name in the header's snapshot
 * context.  On success returns its index and, if requested, passes
 * back its id (*seq) and the image size as of that snapshot (*size).
 * Returns -ENOENT if no snapshot has that name.
 */
static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
			u64 *seq, u64 *size)
{
	int i;
	char *p = header->snap_names;	/* NUL-separated name list */

	rbd_assert(header->snapc != NULL);
	for (i = 0; i < header->snapc->num_snaps; i++) {
		if (!strcmp(snap_name, p)) {

			/* Found it.  Pass back its id and/or size */

			if (seq)
				*seq = header->snapc->snaps[i];
			if (size)
				*size = header->snap_sizes[i];
			return i;
		}
		p += strlen(p) + 1;	/* Skip ahead to the next name */
	}
	return -ENOENT;
}
641
/*
 * Point the mapping at the snapshot named in rbd_dev->snap_name, or
 * at the live image when the name is RBD_SNAP_HEAD_NAME.  Updates
 * snap_id, snap_exists and read_only under header_rwsem and
 * optionally returns the mapped size in *size.  Returns 0, or
 * -ENOENT if the snapshot name is unknown.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the live image, not a snapshot */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = rbd_dev->rbd_opts.read_only;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
				   &snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = true;	/* No choice for snapshots */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
672
/*
 * Release everything rbd_header_from_disk() allocated.  Each pointer
 * is reset to NULL so a repeated call is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);	/* refcounted, not kfree'd */
	header->snapc = NULL;
}
684
Alex Elder65ccfe22012-08-09 10:33:26 -0700685static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700686{
Alex Elder65ccfe22012-08-09 10:33:26 -0700687 char *name;
688 u64 segment;
689 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690
Alex Elder65ccfe22012-08-09 10:33:26 -0700691 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
692 if (!name)
693 return NULL;
694 segment = offset >> rbd_dev->header.obj_order;
695 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
696 rbd_dev->header.object_prefix, segment);
697 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
698 pr_err("error formatting segment name for #%llu (%d)\n",
699 segment, ret);
700 kfree(name);
701 name = NULL;
702 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703
Alex Elder65ccfe22012-08-09 10:33:26 -0700704 return name;
705}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700706
Alex Elder65ccfe22012-08-09 10:33:26 -0700707static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
708{
709 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700710
Alex Elder65ccfe22012-08-09 10:33:26 -0700711 return offset & (segment_size - 1);
712}
713
714static u64 rbd_segment_length(struct rbd_device *rbd_dev,
715 u64 offset, u64 length)
716{
717 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
718
719 offset &= segment_size - 1;
720
Alex Elderaafb2302012-09-06 16:00:54 -0500721 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700722 if (offset + length > segment_size)
723 length = segment_size - offset;
724
725 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726}
727
/*
 * Number of segments (objects) touched by an I/O of @len bytes at
 * image offset @ofs.  Returns 0 for an empty request, or -ERANGE if
 * ofs + len would overflow a u64.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	/* overflow check: equivalent to ofs + (len - 1) > U64_MAX */
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
744
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700745/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700746 * returns the size of an object in the image
747 */
748static u64 rbd_obj_bytes(struct rbd_image_header *header)
749{
750 return 1 << header->obj_order;
751}
752
753/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700754 * bio helpers
755 */
756
757static void bio_chain_put(struct bio *chain)
758{
759 struct bio *tmp;
760
761 while (chain) {
762 tmp = chain;
763 chain = chain->bi_next;
764 bio_put(tmp);
765 }
766}
767
/*
 * zeros a bio chain, starting at specific offset
 *
 * Bytes before @start_ofs (counted from the start of the chain) are
 * left untouched; everything from there to the end of the chain is
 * cleared — presumably to blank the tail of a short transfer (verify
 * against callers).
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset of the current segment */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero only the portion of this segment
				 * lying at or beyond start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
794
795/*
796 * bio_chain_clone - clone a chain of bios up to a certain length.
797 * might return a bio_pair that will need to be released.
798 */
799static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
800 struct bio_pair **bp,
801 int len, gfp_t gfpmask)
802{
Alex Elder542582f2012-08-09 10:33:25 -0700803 struct bio *old_chain = *old;
804 struct bio *new_chain = NULL;
805 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700806 int total = 0;
807
808 if (*bp) {
809 bio_pair_release(*bp);
810 *bp = NULL;
811 }
812
813 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700814 struct bio *tmp;
815
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700816 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
817 if (!tmp)
818 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700819 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700820
821 if (total + old_chain->bi_size > len) {
822 struct bio_pair *bp;
823
824 /*
825 * this split can only happen with a single paged bio,
826 * split_bio will BUG_ON if this is not the case
827 */
828 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500829 "bi_size=%u\n",
830 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700831
832 /* split the bio. We'll release it either in the next
833 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600834 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700835 if (!bp)
836 goto err_out;
837
838 __bio_clone(tmp, &bp->bio1);
839
840 *next = &bp->bio2;
841 } else {
842 __bio_clone(tmp, old_chain);
843 *next = old_chain->bi_next;
844 }
845
846 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700847 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700848 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700849 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700850 else
851 new_chain = tmp;
852 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700853 old_chain = old_chain->bi_next;
854
855 total += tmp->bi_size;
856 }
857
Alex Elderaafb2302012-09-06 16:00:54 -0500858 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700860 *old = old_chain;
861
862 return new_chain;
863
864err_out:
865 dout("bio_chain_clone with err\n");
866 bio_chain_put(new_chain);
867 return NULL;
868}
869
870/*
871 * helpers for osd request op vectors.
872 */
Alex Elder57cfc102012-06-26 12:57:03 -0700873static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
874 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700875{
Alex Elder57cfc102012-06-26 12:57:03 -0700876 struct ceph_osd_req_op *ops;
877
878 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
879 if (!ops)
880 return NULL;
881
882 ops[0].op = opcode;
883
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884 /*
885 * op extent offset and length will be set later on
886 * in calc_raw_layout()
887 */
Alex Elder57cfc102012-06-26 12:57:03 -0700888 ops[0].payload_len = payload_len;
889
890 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700891}
892
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
897
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700898static void rbd_coll_end_req_index(struct request *rq,
899 struct rbd_req_coll *coll,
900 int index,
901 int ret, u64 len)
902{
903 struct request_queue *q;
904 int min, max, i;
905
Alex Elderbd919d42012-07-13 20:35:11 -0500906 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
907 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700908
909 if (!rq)
910 return;
911
912 if (!coll) {
913 blk_end_request(rq, ret, len);
914 return;
915 }
916
917 q = rq->q;
918
919 spin_lock_irq(q->queue_lock);
920 coll->status[index].done = 1;
921 coll->status[index].rc = ret;
922 coll->status[index].bytes = len;
923 max = min = coll->num_done;
924 while (max < coll->total && coll->status[max].done)
925 max++;
926
927 for (i = min; i<max; i++) {
928 __blk_end_request(rq, coll->status[i].rc,
929 coll->status[i].bytes);
930 coll->num_done++;
931 kref_put(&coll->kref, rbd_coll_release);
932 }
933 spin_unlock_irq(q->queue_lock);
934}
935
/*
 * Complete the collection slot recorded in @req (its request,
 * collection pointer and slot index) with the given result and
 * byte count.
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
941
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700942/*
943 * Send ceph osd request
944 */
945static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500946 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 struct ceph_snap_context *snapc,
948 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500949 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700950 struct bio *bio,
951 struct page **pages,
952 int num_pages,
953 int flags,
954 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700955 struct rbd_req_coll *coll,
956 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700957 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700958 struct ceph_msg *msg),
959 struct ceph_osd_request **linger_req,
960 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700961{
962 struct ceph_osd_request *req;
963 struct ceph_file_layout *layout;
964 int ret;
965 u64 bno;
966 struct timespec mtime = CURRENT_TIME;
967 struct rbd_request *req_data;
968 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600969 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700970
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700971 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700972 if (!req_data) {
973 if (coll)
974 rbd_coll_end_req_index(rq, coll, coll_index,
975 -ENOMEM, len);
976 return -ENOMEM;
977 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700978
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700979 if (coll) {
980 req_data->coll = coll;
981 req_data->coll_index = coll_index;
982 }
983
Alex Elderbd919d42012-07-13 20:35:11 -0500984 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
985 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700986
Alex Elder0ce1a792012-07-03 16:01:18 -0500987 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600988 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
989 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700990 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700991 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700992 goto done_pages;
993 }
994
995 req->r_callback = rbd_cb;
996
997 req_data->rq = rq;
998 req_data->bio = bio;
999 req_data->pages = pages;
1000 req_data->len = len;
1001
1002 req->r_priv = req_data;
1003
1004 reqhead = req->r_request->front.iov_base;
1005 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1006
Alex Elderaded07e2012-07-03 16:01:18 -05001007 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001008 req->r_oid_len = strlen(req->r_oid);
1009
1010 layout = &req->r_file_layout;
1011 memset(layout, 0, sizeof(*layout));
1012 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1013 layout->fl_stripe_count = cpu_to_le32(1);
1014 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001015 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001016 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1017 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001018
1019 ceph_osdc_build_request(req, ofs, &len,
1020 ops,
1021 snapc,
1022 &mtime,
1023 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001024
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001025 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001026 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001027 *linger_req = req;
1028 }
1029
Alex Elder1dbb4392012-01-24 10:08:37 -06001030 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001031 if (ret < 0)
1032 goto done_err;
1033
1034 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001035 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001036 if (ver)
1037 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001038 dout("reassert_ver=%llu\n",
1039 (unsigned long long)
1040 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001041 ceph_osdc_put_request(req);
1042 }
1043 return ret;
1044
1045done_err:
1046 bio_chain_put(req_data->bio);
1047 ceph_osdc_put_request(req);
1048done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001049 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001050 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001051 return ret;
1052}
1053
1054/*
1055 * Ceph osd op callback
1056 */
1057static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1058{
1059 struct rbd_request *req_data = req->r_priv;
1060 struct ceph_osd_reply_head *replyhead;
1061 struct ceph_osd_op *op;
1062 __s32 rc;
1063 u64 bytes;
1064 int read_op;
1065
1066 /* parse reply */
1067 replyhead = msg->front.iov_base;
1068 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1069 op = (void *)(replyhead + 1);
1070 rc = le32_to_cpu(replyhead->result);
1071 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001072 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001073
Alex Elderbd919d42012-07-13 20:35:11 -05001074 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1075 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001076
1077 if (rc == -ENOENT && read_op) {
1078 zero_bio_chain(req_data->bio, 0);
1079 rc = 0;
1080 } else if (rc == 0 && read_op && bytes < req_data->len) {
1081 zero_bio_chain(req_data->bio, bytes);
1082 bytes = req_data->len;
1083 }
1084
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001085 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001086
1087 if (req_data->bio)
1088 bio_chain_put(req_data->bio);
1089
1090 ceph_osdc_put_request(req);
1091 kfree(req_data);
1092}
1093
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001094static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1095{
1096 ceph_osdc_put_request(req);
1097}
1098
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001099/*
1100 * Do a synchronous ceph osd operation
1101 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001102static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001103 struct ceph_snap_context *snapc,
1104 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001105 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001106 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001107 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001108 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001109 char *buf,
1110 struct ceph_osd_request **linger_req,
1111 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001112{
1113 int ret;
1114 struct page **pages;
1115 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001116
Alex Elderaafb2302012-09-06 16:00:54 -05001117 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001118
1119 num_pages = calc_pages_for(ofs , len);
1120 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001121 if (IS_ERR(pages))
1122 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001123
Alex Elder0ce1a792012-07-03 16:01:18 -05001124 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001125 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001126 pages, num_pages,
1127 flags,
1128 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001129 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001130 NULL,
1131 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001132 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001133 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001134
1135 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1136 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1137
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001138done:
1139 ceph_release_page_vector(pages, num_pages);
1140 return ret;
1141}
1142
1143/*
1144 * Do an asynchronous ceph osd operation
1145 */
1146static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001147 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148 struct ceph_snap_context *snapc,
1149 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001150 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001151 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001152 struct bio *bio,
1153 struct rbd_req_coll *coll,
1154 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001155{
1156 char *seg_name;
1157 u64 seg_ofs;
1158 u64 seg_len;
1159 int ret;
1160 struct ceph_osd_req_op *ops;
1161 u32 payload_len;
1162
Alex Elder65ccfe22012-08-09 10:33:26 -07001163 seg_name = rbd_segment_name(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001164 if (!seg_name)
1165 return -ENOMEM;
Alex Elder65ccfe22012-08-09 10:33:26 -07001166 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1167 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168
1169 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1170
Alex Elder57cfc102012-06-26 12:57:03 -07001171 ret = -ENOMEM;
1172 ops = rbd_create_rw_ops(1, opcode, payload_len);
1173 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001174 goto done;
1175
1176 /* we've taken care of segment sizes earlier when we
1177 cloned the bios. We should never have a segment
1178 truncated at this point */
Alex Elderaafb2302012-09-06 16:00:54 -05001179 rbd_assert(seg_len == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180
1181 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1182 seg_name, seg_ofs, seg_len,
1183 bio,
1184 NULL, 0,
1185 flags,
1186 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001187 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001188 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001189
1190 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001191done:
1192 kfree(seg_name);
1193 return ret;
1194}
1195
1196/*
1197 * Request async osd write
1198 */
1199static int rbd_req_write(struct request *rq,
1200 struct rbd_device *rbd_dev,
1201 struct ceph_snap_context *snapc,
1202 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001203 struct bio *bio,
1204 struct rbd_req_coll *coll,
1205 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001206{
1207 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1208 CEPH_OSD_OP_WRITE,
1209 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001210 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001211}
1212
1213/*
1214 * Request async osd read
1215 */
1216static int rbd_req_read(struct request *rq,
1217 struct rbd_device *rbd_dev,
1218 u64 snapid,
1219 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001220 struct bio *bio,
1221 struct rbd_req_coll *coll,
1222 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001223{
1224 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001225 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001226 CEPH_OSD_OP_READ,
1227 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001228 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001229}
1230
1231/*
1232 * Request sync osd read
1233 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001234static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001235 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001236 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001237 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001238 char *buf,
1239 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001240{
Alex Elder913d2fd2012-06-26 12:57:03 -07001241 struct ceph_osd_req_op *ops;
1242 int ret;
1243
1244 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1245 if (!ops)
1246 return -ENOMEM;
1247
1248 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001249 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001250 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001251 ops, object_name, ofs, len, buf, NULL, ver);
1252 rbd_destroy_ops(ops);
1253
1254 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001255}
1256
1257/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001258 * Request sync osd watch
1259 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001260static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001261 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001262 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001263{
1264 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001265 int ret;
1266
Alex Elder57cfc102012-06-26 12:57:03 -07001267 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1268 if (!ops)
1269 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001270
Josh Durgina71b8912011-12-05 18:10:44 -08001271 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001272 ops[0].watch.cookie = notify_id;
1273 ops[0].watch.flag = 0;
1274
Alex Elder0ce1a792012-07-03 16:01:18 -05001275 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001276 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001277 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001278 CEPH_OSD_FLAG_READ,
1279 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001280 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001281 rbd_simple_req_cb, 0, NULL);
1282
1283 rbd_destroy_ops(ops);
1284 return ret;
1285}
1286
/*
 * Watch event callback for the header object.
 *
 * @data is the rbd_device registered in rbd_req_sync_watch().  On a
 * notification, re-read the header (picking up size/snapshot
 * changes) and acknowledge the notify with the refreshed header
 * version so the notifier is unblocked.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even if the refresh failed, so the notifier isn't stalled */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1306
1307/*
1308 * Request sync osd watch
1309 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001310static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001311{
1312 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001313 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001314 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001315
Alex Elder57cfc102012-06-26 12:57:03 -07001316 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1317 if (!ops)
1318 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001319
1320 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001321 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001322 if (ret < 0)
1323 goto fail;
1324
Alex Elder0e6f3222012-07-25 09:32:40 -05001325 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001326 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001327 ops[0].watch.flag = 1;
1328
Alex Elder0ce1a792012-07-03 16:01:18 -05001329 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001330 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001331 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1332 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001333 rbd_dev->header_name,
1334 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001335 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001336
1337 if (ret < 0)
1338 goto fail_event;
1339
1340 rbd_destroy_ops(ops);
1341 return 0;
1342
1343fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001344 ceph_osdc_cancel_event(rbd_dev->watch_event);
1345 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001346fail:
1347 rbd_destroy_ops(ops);
1348 return ret;
1349}
1350
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001351/*
1352 * Request sync osd unwatch
1353 */
Alex Elder070c6332012-07-25 09:32:41 -05001354static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001355{
1356 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001357 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001358
Alex Elder57cfc102012-06-26 12:57:03 -07001359 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1360 if (!ops)
1361 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001362
1363 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001364 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001365 ops[0].watch.flag = 0;
1366
Alex Elder0ce1a792012-07-03 16:01:18 -05001367 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001368 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001369 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1370 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001371 rbd_dev->header_name,
1372 0, 0, NULL, NULL, NULL);
1373
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001374
1375 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001376 ceph_osdc_cancel_event(rbd_dev->watch_event);
1377 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001378 return ret;
1379}
1380
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001381struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001382 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001383};
1384
1385static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1386{
Alex Elder0ce1a792012-07-03 16:01:18 -05001387 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1388 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001389 return;
1390
Alex Elderbd919d42012-07-13 20:35:11 -05001391 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1392 rbd_dev->header_name, (unsigned long long) notify_id,
1393 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001394}
1395
1396/*
1397 * Request sync osd notify
1398 */
Alex Elder4cb16252012-07-25 09:32:40 -05001399static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001400{
1401 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001402 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001403 struct ceph_osd_event *event;
1404 struct rbd_notify_info info;
1405 int payload_len = sizeof(u32) + sizeof(u32);
1406 int ret;
1407
Alex Elder57cfc102012-06-26 12:57:03 -07001408 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1409 if (!ops)
1410 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001411
Alex Elder0ce1a792012-07-03 16:01:18 -05001412 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001413
1414 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1415 (void *)&info, &event);
1416 if (ret < 0)
1417 goto fail;
1418
1419 ops[0].watch.ver = 1;
1420 ops[0].watch.flag = 1;
1421 ops[0].watch.cookie = event->cookie;
1422 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1423 ops[0].watch.timeout = 12;
1424
Alex Elder0ce1a792012-07-03 16:01:18 -05001425 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001426 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001427 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1428 ops,
Alex Elder4cb16252012-07-25 09:32:40 -05001429 rbd_dev->header_name,
1430 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001431 if (ret < 0)
1432 goto fail_event;
1433
1434 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1435 dout("ceph_osdc_wait_event returned %d\n", ret);
1436 rbd_destroy_ops(ops);
1437 return 0;
1438
1439fail_event:
1440 ceph_osdc_cancel_event(event);
1441fail:
1442 rbd_destroy_ops(ops);
1443 return ret;
1444}
1445
1446/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001447 * Request sync osd read
1448 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001449static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001450 const char *object_name,
1451 const char *class_name,
1452 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001453 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001454 int len,
1455 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001456{
1457 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001458 int class_name_len = strlen(class_name);
1459 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001460 int ret;
1461
1462 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001463 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001464 if (!ops)
1465 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001466
Alex Elderaded07e2012-07-03 16:01:18 -05001467 ops[0].cls.class_name = class_name;
1468 ops[0].cls.class_len = (__u8) class_name_len;
1469 ops[0].cls.method_name = method_name;
1470 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001471 ops[0].cls.argc = 0;
1472 ops[0].cls.indata = data;
1473 ops[0].cls.indata_len = len;
1474
Alex Elder0ce1a792012-07-03 16:01:18 -05001475 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001476 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001477 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1478 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001479 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001480
1481 rbd_destroy_ops(ops);
1482
1483 dout("cls_exec returned %d\n", ret);
1484 return ret;
1485}
1486
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001487static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1488{
1489 struct rbd_req_coll *coll =
1490 kzalloc(sizeof(struct rbd_req_coll) +
1491 sizeof(struct rbd_req_status) * num_reqs,
1492 GFP_ATOMIC);
1493
1494 if (!coll)
1495 return NULL;
1496 coll->total = num_reqs;
1497 kref_init(&coll->kref);
1498 return coll;
1499}
1500
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001501/*
1502 * block device queue callback
1503 */
1504static void rbd_rq_fn(struct request_queue *q)
1505{
1506 struct rbd_device *rbd_dev = q->queuedata;
1507 struct request *rq;
1508 struct bio_pair *bp = NULL;
1509
Alex Elder00f1f362012-02-07 12:03:36 -06001510 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511 struct bio *bio;
1512 struct bio *rq_bio, *next_bio = NULL;
1513 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001514 unsigned int size;
1515 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001516 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001517 int num_segs, cur_seg = 0;
1518 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001519 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001521 dout("fetched request\n");
1522
1523 /* filter out block requests we don't understand */
1524 if ((rq->cmd_type != REQ_TYPE_FS)) {
1525 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001526 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001527 }
1528
1529 /* deduce our operation (read, write) */
1530 do_write = (rq_data_dir(rq) == WRITE);
1531
1532 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001533 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001534 rq_bio = rq->bio;
1535 if (do_write && rbd_dev->read_only) {
1536 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001537 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001538 }
1539
1540 spin_unlock_irq(q->queue_lock);
1541
Josh Durgind1d25642011-12-05 14:03:05 -08001542 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001543
Josh Durgind1d25642011-12-05 14:03:05 -08001544 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001545 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001546 dout("request for non-existent snapshot");
1547 spin_lock_irq(q->queue_lock);
1548 __blk_end_request_all(rq, -ENXIO);
1549 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001550 }
1551
Josh Durgind1d25642011-12-05 14:03:05 -08001552 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1553
1554 up_read(&rbd_dev->header_rwsem);
1555
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001556 dout("%s 0x%x bytes at 0x%llx\n",
1557 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001558 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001559
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001560 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001561 if (num_segs <= 0) {
1562 spin_lock_irq(q->queue_lock);
1563 __blk_end_request_all(rq, num_segs);
1564 ceph_put_snap_context(snapc);
1565 continue;
1566 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001567 coll = rbd_alloc_coll(num_segs);
1568 if (!coll) {
1569 spin_lock_irq(q->queue_lock);
1570 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001571 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001572 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001573 }
1574
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001575 do {
1576 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001577 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elder65ccfe22012-08-09 10:33:26 -07001578 op_size = rbd_segment_length(rbd_dev, ofs, size);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001579 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001580 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1581 op_size, GFP_ATOMIC);
1582 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001583 rbd_coll_end_req_index(rq, coll, cur_seg,
1584 -ENOMEM, op_size);
1585 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001586 }
1587
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001588
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001589 /* init OSD command: write or read */
1590 if (do_write)
1591 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001592 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001593 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001594 op_size, bio,
1595 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001596 else
1597 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001598 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001599 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001600 op_size, bio,
1601 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001602
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001603next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001604 size -= op_size;
1605 ofs += op_size;
1606
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001607 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001608 rq_bio = next_bio;
1609 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001610 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001611
1612 if (bp)
1613 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001614 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001615
1616 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001617 }
1618}
1619
1620/*
1621 * a queue callback. Makes sure that we don't create a bio that spans across
1622 * multiple osd objects. One exception would be with a single page bios,
1623 * which we handle later at bio_chain_clone
1624 */
1625static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1626 struct bio_vec *bvec)
1627{
1628 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001629 unsigned int chunk_sectors;
1630 sector_t sector;
1631 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001632 int max;
1633
Alex Elder593a9e72012-02-07 12:03:37 -06001634 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1635 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1636 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1637
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001638 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001639 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001640 if (max < 0)
1641 max = 0; /* bio_add cannot handle a negative return */
1642 if (max <= bvec->bv_len && bio_sectors == 0)
1643 return bvec->bv_len;
1644 return max;
1645}
1646
1647static void rbd_free_disk(struct rbd_device *rbd_dev)
1648{
1649 struct gendisk *disk = rbd_dev->disk;
1650
1651 if (!disk)
1652 return;
1653
1654 rbd_header_free(&rbd_dev->header);
1655
1656 if (disk->flags & GENHD_FL_UP)
1657 del_gendisk(disk);
1658 if (disk->queue)
1659 blk_cleanup_queue(disk->queue);
1660 put_disk(disk);
1661}
1662
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		/* Drop the buffer from a previous, too-small attempt */
		kfree(ondisk);

		/* base header + snapshot id array + snapshot name block */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* Short read: object is smaller than its header claims */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* Loop again if the snapshot count changed under us */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1734
1735/*
1736 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001737 */
1738static int rbd_read_header(struct rbd_device *rbd_dev,
1739 struct rbd_image_header *header)
1740{
Alex Elder4156d992012-08-02 11:29:46 -05001741 struct rbd_image_header_ondisk *ondisk;
1742 u64 ver = 0;
1743 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744
Alex Elder4156d992012-08-02 11:29:46 -05001745 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1746 if (IS_ERR(ondisk))
1747 return PTR_ERR(ondisk);
1748 ret = rbd_header_from_disk(header, ondisk);
1749 if (ret >= 0)
1750 header->obj_version = ver;
1751 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001752
Alex Elder4156d992012-08-02 11:29:46 -05001753 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001754}
1755
1756/*
1757 * create a snapshot
1758 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001759static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001760 const char *snap_name,
1761 gfp_t gfp_flags)
1762{
1763 int name_len = strlen(snap_name);
1764 u64 new_snapid;
1765 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001766 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001767 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001768
1769 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001770 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001771 return -EINVAL;
1772
Alex Elder0ce1a792012-07-03 16:01:18 -05001773 monc = &rbd_dev->rbd_client->client->monc;
1774 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001775 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001776 if (ret < 0)
1777 return ret;
1778
1779 data = kmalloc(name_len + 16, gfp_flags);
1780 if (!data)
1781 return -ENOMEM;
1782
Sage Weil916d4d62011-05-12 16:10:50 -07001783 p = data;
1784 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001785
Sage Weil916d4d62011-05-12 16:10:50 -07001786 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1787 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001788
Alex Elder0bed54d2012-07-03 16:01:18 -05001789 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001790 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001791 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001792
Sage Weil916d4d62011-05-12 16:10:50 -07001793 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001794
Alex Elder505cbb92012-07-19 08:49:18 -05001795 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001796bad:
1797 return -ERANGE;
1798}
1799
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001800static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1801{
1802 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001803 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001804
Alex Eldera0593292012-07-19 09:09:27 -05001805 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001806 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001807}
1808
/*
 * Re-read the image header from the OSDs and fold the result into
 * the rbd device: update the capacity (when mapping the head), swap
 * in the new snapshot context/names/sizes, and reconcile the
 * snapshot device list.  Takes header_rwsem for write; normally
 * called under ctl_mutex (see rbd_refresh_header()).  If @hver is
 * non-NULL it receives the version of the header object just read.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Bring the snapshot device list in line with the new context */
	ret = rbd_dev_snap_devs_update(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1854
Alex Elder1fe5e992012-07-25 09:32:41 -05001855static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1856{
1857 int ret;
1858
1859 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1860 ret = __rbd_refresh_header(rbd_dev, hver);
1861 mutex_unlock(&ctl_mutex);
1862
1863 return ret;
1864}
1865
/*
 * Allocate and set up the gendisk and request queue for an rbd
 * device: read the image header from the OSDs, build the snapshot
 * device list, select the mapped snapshot, then configure the queue
 * limits to match the object size and announce the disk.
 * Returns 0 on success or a negative errno.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snap_devs_update(rbd_dev);
	if (rc)
		return rc;

	/* pick the mapped snapshot and learn its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios within a single object (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1937
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001938/*
1939 sysfs
1940*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001941
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1946
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001947static ssize_t rbd_size_show(struct device *dev,
1948 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001949{
Alex Elder593a9e72012-02-07 12:03:37 -06001950 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001951 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001952
Josh Durgina51aa0c2011-12-05 10:35:04 -08001953 down_read(&rbd_dev->header_rwsem);
1954 size = get_capacity(rbd_dev->disk);
1955 up_read(&rbd_dev->header_rwsem);
1956
1957 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001958}
1959
/* sysfs "major": block major number assigned to this device. */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
1967
/* sysfs "client_id": id of the ceph client instance used by this device. */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1976
/* sysfs "pool": name of the rados pool holding the image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1984
/* sysfs "pool_id": numeric id of the rados pool holding the image. */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->pool_id);
}
1992
/* sysfs "name": name of the mapped rbd image. */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_name);
}
2000
/* sysfs "current_snap": name of the mapped snapshot
 * (presumably a sentinel string when mapping the head — set elsewhere). */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->snap_name);
}
2009
2010static ssize_t rbd_image_refresh(struct device *dev,
2011 struct device_attribute *attr,
2012 const char *buf,
2013 size_t size)
2014{
Alex Elder593a9e72012-02-07 12:03:37 -06002015 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002016 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002017
Alex Elder1fe5e992012-07-25 09:32:41 -05002018 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002019
2020 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002021}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002022
/* Per-device sysfs attributes: read-only status fields plus two
 * write-only action hooks (refresh, create_snap). */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002032
/* NULL-terminated attribute list for the rbd device sysfs group. */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};
2045
/* Wrap the attribute list in the (single) group the device type uses. */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2054
/*
 * Intentionally empty device-type release hook: the rbd_device is
 * freed via dev->release (rbd_dev_release, installed in
 * rbd_bus_add_dev()), not through the device type.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2058
/* sysfs device type for mapped rbd devices; exposes rbd_attr_groups. */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2064
2065
2066/*
2067 sysfs - snapshots
2068*/
2069
/* sysfs "snap_size": image size at the time the snapshot was taken. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
2078
/* sysfs "snap_id": ceph snapshot id of this snapshot. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
2087
/* Per-snapshot sysfs attributes (both read-only). */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2090
/* NULL-terminated attribute list and group for snapshot devices. */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2100
/*
 * Device release callback for a snapshot: frees the snapshot's name
 * and the rbd_snap itself once the last device reference is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}
2107
/* sysfs group list and device type for snapshot devices. */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2117
/*
 * Unlink a snapshot from its device's list and unregister its sysfs
 * device; dropping the last device reference frees the snapshot via
 * rbd_snap_dev_release().
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2123
Alex Elder14e70852012-07-19 09:09:27 -05002124static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002125 struct device *parent)
2126{
2127 struct device *dev = &snap->dev;
2128 int ret;
2129
2130 dev->type = &rbd_snap_device_type;
2131 dev->parent = parent;
2132 dev->release = rbd_snap_dev_release;
2133 dev_set_name(dev, "snap_%s", snap->name);
2134 ret = device_register(dev);
2135
2136 return ret;
2137}
2138
Alex Elder4e891e02012-07-10 20:30:10 -05002139static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2140 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002141{
Alex Elder4e891e02012-07-10 20:30:10 -05002142 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002143 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002144
2145 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002146 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002147 return ERR_PTR(-ENOMEM);
2148
2149 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002150 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002151 if (!snap->name)
2152 goto err;
2153
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002154 snap->size = rbd_dev->header.snap_sizes[i];
2155 snap->id = rbd_dev->header.snapc->snaps[i];
2156 if (device_is_registered(&rbd_dev->dev)) {
Alex Elder14e70852012-07-19 09:09:27 -05002157 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002158 if (ret < 0)
2159 goto err;
2160 }
Alex Elder4e891e02012-07-10 20:30:10 -05002161
2162 return snap;
2163
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002164err:
2165 kfree(snap->name);
2166 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002167
2168 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002169}
2170
2171/*
Alex Elder35938152012-08-02 11:29:46 -05002172 * Scan the rbd device's current snapshot list and compare it to the
2173 * newly-received snapshot context. Remove any existing snapshots
2174 * not present in the new snapshot context. Add a new snapshot for
2175 * any snaphots in the snapshot context not in the current list.
2176 * And verify there are no changes to snapshots we already know
2177 * about.
2178 *
2179 * Assumes the snapshots in the snapshot context are sorted by
2180 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2181 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002182 */
Alex Elder9fcbb802012-08-23 23:48:49 -05002183static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002184{
Alex Elder35938152012-08-02 11:29:46 -05002185 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2186 const u32 snap_count = snapc->num_snaps;
2187 char *snap_name = rbd_dev->header.snap_names;
2188 struct list_head *head = &rbd_dev->snaps;
2189 struct list_head *links = head->next;
2190 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002191
Alex Elder9fcbb802012-08-23 23:48:49 -05002192 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002193 while (index < snap_count || links != head) {
2194 u64 snap_id;
2195 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002196
Alex Elder35938152012-08-02 11:29:46 -05002197 snap_id = index < snap_count ? snapc->snaps[index]
2198 : CEPH_NOSNAP;
2199 snap = links != head ? list_entry(links, struct rbd_snap, node)
2200 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002201 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002202
Alex Elder35938152012-08-02 11:29:46 -05002203 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2204 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002205
Alex Elder35938152012-08-02 11:29:46 -05002206 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002207
Alex Elder35938152012-08-02 11:29:46 -05002208 if (rbd_dev->snap_id == snap->id)
Josh Durgine88a36e2011-11-21 18:14:25 -08002209 rbd_dev->snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002210 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002211 dout("%ssnap id %llu has been removed\n",
2212 rbd_dev->snap_id == snap->id ? "mapped " : "",
2213 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002214
Alex Elder35938152012-08-02 11:29:46 -05002215 /* Done with this list entry; advance */
2216
2217 links = next;
2218 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002219 }
Alex Elder35938152012-08-02 11:29:46 -05002220
Alex Elder9fcbb802012-08-23 23:48:49 -05002221 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2222 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002223 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2224 struct rbd_snap *new_snap;
2225
2226 /* We haven't seen this snapshot before */
2227
2228 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2229 snap_name);
Alex Elder9fcbb802012-08-23 23:48:49 -05002230 if (IS_ERR(new_snap)) {
2231 int err = PTR_ERR(new_snap);
2232
2233 dout(" failed to add dev, error %d\n", err);
2234
2235 return err;
2236 }
Alex Elder35938152012-08-02 11:29:46 -05002237
2238 /* New goes before existing, or at end of list */
2239
Alex Elder9fcbb802012-08-23 23:48:49 -05002240 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002241 if (snap)
2242 list_add_tail(&new_snap->node, &snap->node);
2243 else
Alex Elder523f3252012-08-30 00:16:37 -05002244 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002245 } else {
2246 /* Already have this one */
2247
Alex Elder9fcbb802012-08-23 23:48:49 -05002248 dout(" already present\n");
2249
Alex Elderaafb2302012-09-06 16:00:54 -05002250 rbd_assert(snap->size ==
2251 rbd_dev->header.snap_sizes[index]);
2252 rbd_assert(!strcmp(snap->name, snap_name));
Alex Elder35938152012-08-02 11:29:46 -05002253
2254 /* Done with this list entry; advance */
2255
2256 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002257 }
Alex Elder35938152012-08-02 11:29:46 -05002258
2259 /* Advance to the next entry in the snapshot context */
2260
2261 index++;
2262 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002263 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002264 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002265
2266 return 0;
2267}
2268
/*
 * Register the rbd device, and each snapshot already on its list,
 * with sysfs under the rbd bus.  Takes ctl_mutex for the duration.
 * Returns 0 on success or a negative errno.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* frees rbd_dev on final put */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* NOTE(review): a failure partway through this loop returns with
	 * earlier snaps (and the device) still registered — presumably
	 * the caller unwinds; confirm against the caller's error path. */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2296
/* Undo rbd_bus_add_dev(): unregister the device from sysfs. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002302static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2303{
2304 int ret, rc;
2305
2306 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002307 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002308 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002309 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002310 if (rc < 0)
2311 return rc;
2312 }
2313 } while (ret == -ERANGE);
2314
2315 return ret;
2316}
2317
/* Largest device id handed out so far; bumped by rbd_dev_id_get()
 * and recomputed by rbd_dev_id_put() when the max id is released. */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002319
2320/*
Alex Elder499afd52012-02-02 08:13:29 -06002321 * Get a unique rbd identifier for the given new rbd_dev, and add
2322 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002323 */
Alex Eldere2839302012-08-29 17:11:06 -05002324static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002325{
Alex Eldere2839302012-08-29 17:11:06 -05002326 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002327
2328 spin_lock(&rbd_dev_list_lock);
2329 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2330 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05002331 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2332 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06002333}
Alex Elderb7f23c32012-01-29 13:57:43 -06002334
Alex Elder1ddbe942012-01-29 13:57:44 -06002335/*
Alex Elder499afd52012-02-02 08:13:29 -06002336 * Remove an rbd_dev from the global list, and record that its
2337 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002338 */
Alex Eldere2839302012-08-29 17:11:06 -05002339static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002340{
Alex Elderd184f6b2012-01-29 13:57:44 -06002341 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002342 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002343 int max_id;
2344
Alex Elderaafb2302012-09-06 16:00:54 -05002345 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002346
Alex Eldere2839302012-08-29 17:11:06 -05002347 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2348 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002349 spin_lock(&rbd_dev_list_lock);
2350 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002351
2352 /*
2353 * If the id being "put" is not the current maximum, there
2354 * is nothing special we need to do.
2355 */
Alex Eldere2839302012-08-29 17:11:06 -05002356 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002357 spin_unlock(&rbd_dev_list_lock);
2358 return;
2359 }
2360
2361 /*
2362 * We need to update the current maximum id. Search the
2363 * list to find out what it is. We're more likely to find
2364 * the maximum at the end, so search the list backward.
2365 */
2366 max_id = 0;
2367 list_for_each_prev(tmp, &rbd_dev_list) {
2368 struct rbd_device *rbd_dev;
2369
2370 rbd_dev = list_entry(tmp, struct rbd_device, node);
2371 if (rbd_id > max_id)
2372 max_id = rbd_id;
2373 }
Alex Elder499afd52012-02-02 08:13:29 -06002374 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002375
Alex Elder1ddbe942012-01-29 13:57:44 -06002376 /*
Alex Eldere2839302012-08-29 17:11:06 -05002377 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002378 * which case it now accurately reflects the new maximum.
2379 * Be careful not to overwrite the maximum value in that
2380 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002381 */
Alex Eldere2839302012-08-29 17:11:06 -05002382 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2383 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002384}
2385
Alex Eldera725f65e2012-02-02 08:13:30 -06002386/*
Alex Eldere28fff262012-02-02 08:13:30 -06002387 * Skips over white space at *buf, and updates *buf to point to the
2388 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002389 * the token (string of non-white space characters) found. Note
2390 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002391 */
2392static inline size_t next_token(const char **buf)
2393{
2394 /*
2395 * These are the characters that produce nonzero for
2396 * isspace() in the "C" and "POSIX" locales.
2397 */
2398 const char *spaces = " \f\n\r\t\v";
2399
2400 *buf += strspn(*buf, spaces); /* Find start of token */
2401
2402 return strcspn(*buf, spaces); /* Return token length */
2403}
2404
/*
 * Find the next token in *buf and, provided the supplied buffer is
 * large enough, copy it there.  When copied, the result is always
 * NUL-terminated.  *buf must be NUL-terminated on entry.
 *
 * Returns the length of the token found (excluding the '\0').  A
 * return of 0 means no token was found; a return >= token_size
 * means the token did not fit and nothing was copied.
 *
 * *buf is advanced past the token in every case — even when the
 * token was too large for the destination buffer.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2434
2435/*
Alex Elderea3352f2012-07-09 21:04:23 -05002436 * Finds the next token in *buf, dynamically allocates a buffer big
2437 * enough to hold a copy of it, and copies the token into the new
2438 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2439 * that a duplicate buffer is created even for a zero-length token.
2440 *
2441 * Returns a pointer to the newly-allocated duplicate, or a null
2442 * pointer if memory for the duplicate was not available. If
2443 * the lenp argument is a non-null pointer, the length of the token
2444 * (not including the '\0') is returned in *lenp.
2445 *
2446 * If successful, the *buf pointer will be updated to point beyond
2447 * the end of the found token.
2448 *
2449 * Note: uses GFP_KERNEL for allocation.
2450 */
2451static inline char *dup_token(const char **buf, size_t *lenp)
2452{
2453 char *dup;
2454 size_t len;
2455
2456 len = next_token(buf);
2457 dup = kmalloc(len + 1, GFP_KERNEL);
2458 if (!dup)
2459 return NULL;
2460
2461 memcpy(dup, *buf, len);
2462 *(dup + len) = '\0';
2463 *buf += len;
2464
2465 if (lenp)
2466 *lenp = len;
2467
2468 return dup;
2469}
2470
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * The expected token order in buf is:
 *	<mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * The first four are required; the snapshot name is optional.
 *
 * On success the rbd_dev owns newly-allocated pool_name, image_name,
 * header_name and snap_name buffers; on failure all of them are freed
 * and reset, and a negative errno is returned.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/*
	 * Monitor addresses are not copied; *mon_addrs points into
	 * buf itself, and *mon_addrs_size includes room for a '\0'.
	 */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Options go into the caller-supplied buffer, bounded. */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* All failures below are allocation failures. */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
					+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto err_out;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Free everything allocated so far and leave rbd_dev clean. */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2553
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002554static ssize_t rbd_add(struct bus_type *bus,
2555 const char *buf,
2556 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002557{
Alex Eldercb8627c2012-07-09 21:04:23 -05002558 char *options;
2559 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002560 const char *mon_addrs = NULL;
2561 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002562 struct ceph_osd_client *osdc;
2563 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002564
2565 if (!try_module_get(THIS_MODULE))
2566 return -ENODEV;
2567
Alex Elder27cc2592012-02-02 08:13:30 -06002568 options = kmalloc(count, GFP_KERNEL);
2569 if (!options)
2570 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002571 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2572 if (!rbd_dev)
2573 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002574
2575 /* static rbd_device initialization */
2576 spin_lock_init(&rbd_dev->lock);
2577 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002578 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002579 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002580
Alex Elderd184f6b2012-01-29 13:57:44 -06002581 /* generate unique id: find highest unique id, add one */
Alex Eldere2839302012-08-29 17:11:06 -05002582 rbd_dev_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002583
Alex Eldera725f65e2012-02-02 08:13:30 -06002584 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002585 BUILD_BUG_ON(DEV_NAME_LEN
2586 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002587 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002588
Alex Eldera725f65e2012-02-02 08:13:30 -06002589 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002590 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002591 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002592 if (rc)
2593 goto err_put_id;
2594
Alex Elderf8c38922012-08-10 13:12:07 -07002595 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2596 if (rc < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002597 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002598
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002599 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002600 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002601 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2602 if (rc < 0)
2603 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002604 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002605
2606 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002607 rc = register_blkdev(0, rbd_dev->name);
2608 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002609 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002610 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002611
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002612 rc = rbd_bus_add_dev(rbd_dev);
2613 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002614 goto err_out_blkdev;
2615
Alex Elder32eec682012-02-08 16:11:14 -06002616 /*
2617 * At this point cleanup in the event of an error is the job
2618 * of the sysfs code (initiated by rbd_bus_del_dev()).
2619 *
2620 * Set up and announce blkdev mapping.
2621 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002622 rc = rbd_init_disk(rbd_dev);
2623 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002624 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002625
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002626 rc = rbd_init_watch_dev(rbd_dev);
2627 if (rc)
2628 goto err_out_bus;
2629
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002630 return count;
2631
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002632err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002633 /* this will also clean up rest of rbd_dev stuff */
2634
2635 rbd_bus_del_dev(rbd_dev);
2636 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002637 return rc;
2638
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002639err_out_blkdev:
2640 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2641err_out_client:
2642 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002643err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002644 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002645 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002646 kfree(rbd_dev->header_name);
2647 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002648 kfree(rbd_dev->pool_name);
2649 }
Alex Eldere2839302012-08-29 17:11:06 -05002650 rbd_dev_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002651err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002652 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002653 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002654
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002655 dout("Error adding device %s\n", buf);
2656 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002657
2658 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002659}
2660
Alex Elderde71a292012-07-03 16:01:19 -05002661static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002662{
2663 struct list_head *tmp;
2664 struct rbd_device *rbd_dev;
2665
Alex Eldere124a822012-01-29 13:57:44 -06002666 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002667 list_for_each(tmp, &rbd_dev_list) {
2668 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002669 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002670 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002671 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002672 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002673 }
Alex Eldere124a822012-01-29 13:57:44 -06002674 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002675 return NULL;
2676}
2677
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002678static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002679{
Alex Elder593a9e72012-02-07 12:03:37 -06002680 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002681
Alex Elder1dbb4392012-01-24 10:08:37 -06002682 if (rbd_dev->watch_request) {
2683 struct ceph_client *client = rbd_dev->rbd_client->client;
2684
2685 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002686 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002687 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002688 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05002689 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002690
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002691 rbd_put_client(rbd_dev);
2692
2693 /* clean up and free blkdev */
2694 rbd_free_disk(rbd_dev);
2695 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002696
2697 /* done with the id, and with the rbd_dev */
Alex Elder820a5f32012-07-09 21:04:24 -05002698 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002699 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002700 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002701 kfree(rbd_dev->image_name);
Alex Eldere2839302012-08-29 17:11:06 -05002702 rbd_dev_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002703 kfree(rbd_dev);
2704
2705 /* release module ref */
2706 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002707}
2708
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002709static ssize_t rbd_remove(struct bus_type *bus,
2710 const char *buf,
2711 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002712{
2713 struct rbd_device *rbd_dev = NULL;
2714 int target_id, rc;
2715 unsigned long ul;
2716 int ret = count;
2717
2718 rc = strict_strtoul(buf, 10, &ul);
2719 if (rc)
2720 return rc;
2721
2722 /* convert to int; abort if we lost anything in the conversion */
2723 target_id = (int) ul;
2724 if (target_id != ul)
2725 return -EINVAL;
2726
2727 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2728
2729 rbd_dev = __rbd_get_dev(target_id);
2730 if (!rbd_dev) {
2731 ret = -ENOENT;
2732 goto done;
2733 }
2734
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002735 __rbd_remove_all_snaps(rbd_dev);
2736 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002737
2738done:
2739 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05002740
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002741 return ret;
2742}
2743
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002744static ssize_t rbd_snap_add(struct device *dev,
2745 struct device_attribute *attr,
2746 const char *buf,
2747 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002748{
Alex Elder593a9e72012-02-07 12:03:37 -06002749 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002750 int ret;
2751 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002752 if (!name)
2753 return -ENOMEM;
2754
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002755 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002756
2757 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2758
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002759 ret = rbd_header_add_snap(rbd_dev,
2760 name, GFP_KERNEL);
2761 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002762 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002763
Alex Elderb8136232012-07-25 09:32:41 -05002764 ret = __rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002765 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002766 goto err_unlock;
2767
2768 /* shouldn't hold ctl_mutex when notifying.. notify might
2769 trigger a watch callback that would need to get that mutex */
2770 mutex_unlock(&ctl_mutex);
2771
2772 /* make a best effort, don't error if failed */
Alex Elder4cb16252012-07-25 09:32:40 -05002773 rbd_req_sync_notify(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002774
2775 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002776 kfree(name);
2777 return ret;
2778
2779err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002780 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002781 kfree(name);
2782 return ret;
2783}
2784
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002785/*
2786 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002787 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002788 */
2789static int rbd_sysfs_init(void)
2790{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002791 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002792
Alex Elderfed4c142012-02-07 12:03:36 -06002793 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002794 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002795 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002796
Alex Elderfed4c142012-02-07 12:03:36 -06002797 ret = bus_register(&rbd_bus_type);
2798 if (ret < 0)
2799 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002800
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002801 return ret;
2802}
2803
/* Tear down the sysfs entries in the reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2809
2810int __init rbd_init(void)
2811{
2812 int rc;
2813
2814 rc = rbd_sysfs_init();
2815 if (rc)
2816 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002817 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002818 return 0;
2819}
2820
/* Module exit: remove the sysfs entries created by rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2825
2826module_init(rbd_init);
2827module_exit(rbd_exit);
2828
2829MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2830MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2831MODULE_DESCRIPTION("rados block device");
2832
2833/* following authorship retained from original osdblk.c */
2834MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2835
2836MODULE_LICENSE("GPL");