blob: 6da6990a7b57148b156edd7b68deb3a9bcbfb512 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderdf111be2012-08-09 10:33:26 -070053/* It might be useful to have this defined elsewhere too */
54
55#define U64_MAX ((u64) (~0ULL))
56
Alex Elderf0f8cef2012-01-29 13:57:44 -060057#define RBD_DRV_NAME "rbd"
58#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070059
60#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
61
Yehuda Sadeh602adf42010-08-12 16:11:25 -070062#define RBD_MAX_SNAP_NAME_LEN 32
63#define RBD_MAX_OPT_LEN 1024
64
65#define RBD_SNAP_HEAD_NAME "-"
66
Alex Elder81a89792012-02-02 08:13:30 -060067/*
68 * An RBD device name will be "rbd#", where the "rbd" comes from
69 * RBD_DRV_NAME above, and # is a unique integer identifier.
70 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
71 * enough to hold all possible device names.
72 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070073#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060074#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070075
Alex Eldercc0538b2012-08-10 13:12:07 -070076#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070077
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* bytes in the base (HEAD) image */
	char *object_prefix;	/* NUL-terminated object name prefix */
	__u8 obj_order;		/* log2 of the per-object size */
	__u8 crypt_type;	/* on-disk crypt type (copied verbatim) */
	__u8 comp_type;		/* on-disk compression type (copied verbatim) */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	u32 total_snaps;	/* number of snapshots */

	char *snap_names;	/* consecutive NUL-terminated names */
	u64 *snap_sizes;	/* image size at each snapshot */

	u64 obj_version;	/* header object version last read */
};
95
/* Per-device mount options parsed from the "add" command */
struct rbd_options {
	bool read_only;		/* map the device read-only */
};
99
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying ceph cluster handle */
	struct kref kref;		/* shared by all devices using it */
	struct list_head node;		/* entry in rbd_client_list */
};
108
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* non-zero once the request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};
117
118/*
119 * a collection of requests
120 */
121struct rbd_req_coll {
122 int total;
123 int num_done;
124 struct kref kref;
125 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700126};
127
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this request, bytes */
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
139
/* In-memory representation of a single snapshot, exported via sysfs */
struct rbd_snap {
	struct device dev;		/* sysfs device for this snapshot */
	const char *name;		/* snapshot name */
	u64 size;			/* image size at this snapshot */
	struct list_head node;		/* entry in rbd_dev->snaps */
	u64 id;				/* snapshot id */
};
147
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_options rbd_opts;	/* options given at map time */
	struct rbd_client *rbd_client;	/* shared ceph client handle */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;	/* in-memory image metadata */
	char *image_name;		/* rbd image name */
	size_t image_name_len;
	char *header_name;		/* name of the image's header object */
	char *pool_name;
	int pool_id;

	struct ceph_osd_event *watch_event;	/* header-change notification */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	bool read_only;		/* effective read-only state for the blkdev */

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
193
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700201
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800202static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
203static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800204static ssize_t rbd_snap_add(struct device *dev,
205 struct device_attribute *attr,
206 const char *buf,
207 size_t count);
Alex Elder14e70852012-07-19 09:09:27 -0500208static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800209
Alex Elderf0f8cef2012-01-29 13:57:44 -0600210static ssize_t rbd_add(struct bus_type *bus, const char *buf,
211 size_t count);
212static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
213 size_t count);
214
/* Bus-level attributes: /sys/bus/rbd/{add,remove}, write-only for root */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
220
/* The pseudo-bus all rbd devices hang off of */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
225
/* Release for the statically-allocated root device: nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}
229
/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
234
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800235
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800236static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
237{
238 return get_device(&rbd_dev->dev);
239}
240
241static void rbd_put_dev(struct rbd_device *rbd_dev)
242{
243 put_device(&rbd_dev->dev);
244}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700245
Alex Elder1fe5e992012-07-25 09:32:41 -0500246static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700247
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248static int rbd_open(struct block_device *bdev, fmode_t mode)
249{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600250 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700251
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700252 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
253 return -EROFS;
254
Alex Elder340c7a22012-08-10 13:12:07 -0700255 rbd_get_dev(rbd_dev);
256 set_device_ro(bdev, rbd_dev->read_only);
257
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700258 return 0;
259}
260
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800261static int rbd_release(struct gendisk *disk, fmode_t mode)
262{
263 struct rbd_device *rbd_dev = disk->private_data;
264
265 rbd_put_dev(rbd_dev);
266
267 return 0;
268}
269
/* Block device operations for rbd block devices */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
275
276/*
277 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500278 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279 */
Alex Elderf8c38922012-08-10 13:12:07 -0700280static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281{
282 struct rbd_client *rbdc;
283 int ret = -ENOMEM;
284
285 dout("rbd_client_create\n");
286 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
287 if (!rbdc)
288 goto out_opt;
289
290 kref_init(&rbdc->kref);
291 INIT_LIST_HEAD(&rbdc->node);
292
Alex Elderbc534d862012-01-29 13:57:44 -0600293 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
294
Alex Elder43ae4702012-07-03 16:01:18 -0500295 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600297 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500298 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700299
300 ret = ceph_open_session(rbdc->client);
301 if (ret < 0)
302 goto out_err;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d862012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
324/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700325 * Find a ceph client with specific addr and configuration. If
326 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700328static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329{
330 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700331 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332
Alex Elder43ae4702012-07-03 16:01:18 -0500333 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 return NULL;
335
Alex Elder1f7ba332012-08-10 13:12:07 -0700336 spin_lock(&rbd_client_list_lock);
337 list_for_each_entry(client_node, &rbd_client_list, node) {
338 if (!ceph_compare_options(ceph_opts, client_node->client)) {
339 kref_get(&client_node->kref);
340 found = true;
341 break;
342 }
343 }
344 spin_unlock(&rbd_client_list_lock);
345
346 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347}
348
/*
 * mount options
 *
 * Token values are partitioned by type: ints below Opt_last_int,
 * strings below Opt_last_string, Booleans below Opt_last_bool.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
362
/* Token table for match_token(); keep in sync with the enum above */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
373
/*
 * Callback for ceph_parse_options(): handle one rbd-specific option
 * token.  @private is the struct rbd_options being filled in.
 * Returns 0 on success or a negative errno for a bad option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Decode the argument by token class (see the enum partitions) */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned a token we don't handle */
		BUG_ON(token);
	}
	return 0;
}
413
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client holds a
 * reference to the (possibly shared) client and 0 is returned;
 * otherwise a negative errno.  Ownership of the parsed ceph options
 * passes to the found/created client or is destroyed here.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* Parses both libceph options and (via callback) rbd options */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() consumes ceph_opts on all paths */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
446
/*
 * Destroy ceph client
 *
 * kref release callback.  Takes rbd_client_list_lock itself to unlink
 * the client (the earlier "caller must hold rbd_client_list_lock"
 * note was stale — no caller holds it here).
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
464
465/*
466 * Drop reference to ceph client node. If it's not referenced anymore, release
467 * it.
468 */
469static void rbd_put_client(struct rbd_device *rbd_dev)
470{
471 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
472 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700473}
474
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700475/*
476 * Destroy requests collection
477 */
478static void rbd_coll_release(struct kref *kref)
479{
480 struct rbd_req_coll *coll =
481 container_of(kref, struct rbd_req_coll, kref);
482
483 dout("rbd_coll_release %p\n", coll);
484 kfree(coll);
485}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700486
Alex Elder8e94af82012-07-25 09:32:40 -0500487static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
488{
Alex Elder103a1502012-08-02 11:29:45 -0500489 size_t size;
490 u32 snap_count;
491
492 /* The header has to start with the magic rbd header text */
493 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
494 return false;
495
496 /*
497 * The size of a snapshot header has to fit in a size_t, and
498 * that limits the number of snapshots.
499 */
500 snap_count = le32_to_cpu(ondisk->snap_count);
501 size = SIZE_MAX - sizeof (struct ceph_snap_context);
502 if (snap_count > size / sizeof (__le64))
503 return false;
504
505 /*
506 * Not only that, but the size of the entire the snapshot
507 * header must also be representable in a size_t.
508 */
509 size -= snap_count * sizeof (__le64);
510 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
511 return false;
512
513 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500514}
515
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700516/*
517 * Create a new header structure, translate header format from the on-disk
518 * header.
519 */
520static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500521 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700522{
Alex Elderccece232012-07-10 20:30:10 -0500523 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500524 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500525 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500526 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700527
Alex Elder6a523252012-07-19 17:12:59 -0500528 memset(header, 0, sizeof (*header));
529
Alex Elder103a1502012-08-02 11:29:45 -0500530 snap_count = le32_to_cpu(ondisk->snap_count);
531
Alex Elder58c17b02012-08-23 23:22:06 -0500532 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
533 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500534 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700535 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500536 memcpy(header->object_prefix, ondisk->object_prefix, len);
537 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600538
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700539 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500540 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
541
Alex Elder621901d2012-08-23 23:22:06 -0500542 /* Save a copy of the snapshot names */
543
Alex Elderf785cc12012-08-23 23:22:06 -0500544 if (snap_names_len > (u64) SIZE_MAX)
545 return -EIO;
546 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700547 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500548 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500549 /*
550 * Note that rbd_dev_v1_header_read() guarantees
551 * the ondisk buffer we're working with has
552 * snap_names_len bytes beyond the end of the
553 * snapshot id array, this memcpy() is safe.
554 */
555 memcpy(header->snap_names, &ondisk->snaps[snap_count],
556 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500557
Alex Elder621901d2012-08-23 23:22:06 -0500558 /* Record each snapshot's size */
559
Alex Elderd2bb24e2012-07-26 23:37:14 -0500560 size = snap_count * sizeof (*header->snap_sizes);
561 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700562 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500563 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500564 for (i = 0; i < snap_count; i++)
565 header->snap_sizes[i] =
566 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700567 } else {
Alex Elderccece232012-07-10 20:30:10 -0500568 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700569 header->snap_names = NULL;
570 header->snap_sizes = NULL;
571 }
Alex Elder849b4262012-07-09 21:04:24 -0500572
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700573 header->image_size = le64_to_cpu(ondisk->image_size);
574 header->obj_order = ondisk->options.order;
575 header->crypt_type = ondisk->options.crypt_type;
576 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500577 header->total_snaps = snap_count;
578
Alex Elder621901d2012-08-23 23:22:06 -0500579 /* Allocate and fill in the snapshot context */
580
Alex Elder6a523252012-07-19 17:12:59 -0500581 size = sizeof (struct ceph_snap_context);
582 size += snap_count * sizeof (header->snapc->snaps[0]);
583 header->snapc = kzalloc(size, GFP_KERNEL);
584 if (!header->snapc)
585 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586
587 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500588 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700589 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500590 for (i = 0; i < snap_count; i++)
591 header->snapc->snaps[i] =
592 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700593
594 return 0;
595
Alex Elder6a523252012-07-19 17:12:59 -0500596out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500597 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500598 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700599 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500600 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500601 kfree(header->object_prefix);
602 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500603
Alex Elder00f1f362012-02-07 12:03:36 -0600604 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605}
606
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700607static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
608 u64 *seq, u64 *size)
609{
610 int i;
611 char *p = header->snap_names;
612
Alex Elder00f1f362012-02-07 12:03:36 -0600613 for (i = 0; i < header->total_snaps; i++) {
614 if (!strcmp(snap_name, p)) {
615
616 /* Found it. Pass back its id and/or size */
617
618 if (seq)
619 *seq = header->snapc->snaps[i];
620 if (size)
621 *size = header->snap_sizes[i];
622 return i;
623 }
624 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700625 }
Alex Elder00f1f362012-02-07 12:03:36 -0600626 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700627}
628
/*
 * Set the device's current snapshot state from rbd_dev->snap_name.
 * Mapping the special HEAD name selects the writable base image;
 * anything else maps a (read-only) snapshot.  Takes header_rwsem for
 * write while updating snap_id/snap_exists/read_only.  Optionally
 * returns the mapped size via @size.  Returns 0 or a negative errno
 * (e.g. -ENOENT if the snapshot name is unknown).
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the base image: writable unless "ro" was given */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = rbd_dev->rbd_opts.read_only;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
				   &snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = true;	/* No choice for snapshots */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
659
660static void rbd_header_free(struct rbd_image_header *header)
661{
Alex Elder849b4262012-07-09 21:04:24 -0500662 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500663 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700664 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500665 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500666 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500667 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800668 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500669 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700670}
671
672/*
673 * get the actual striped segment name, offset and length
674 */
675static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500676 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677 u64 ofs, u64 len,
678 char *seg_name, u64 *segofs)
679{
680 u64 seg = ofs >> header->obj_order;
681
682 if (seg_name)
683 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500684 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700685
686 ofs = ofs & ((1 << header->obj_order) - 1);
687 len = min_t(u64, len, (1 << header->obj_order) - ofs);
688
689 if (segofs)
690 *segofs = ofs;
691
692 return len;
693}
694
/*
 * Number of object segments the byte range [ofs, ofs+len) touches.
 * Returns 0 for an empty range and -ERANGE if ofs+len would overflow
 * a u64.
 *
 * NOTE(review): the count is returned as an int; for a pathological
 * range spanning more than INT_MAX segments the value would be
 * truncated — confirm callers bound len appropriately.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	/* Equivalent to "ofs + len - 1 would overflow", without overflowing */
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
711
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700712/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700713 * returns the size of an object in the image
714 */
715static u64 rbd_obj_bytes(struct rbd_image_header *header)
716{
717 return 1 << header->obj_order;
718}
719
720/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700721 * bio helpers
722 */
723
724static void bio_chain_put(struct bio *chain)
725{
726 struct bio *tmp;
727
728 while (chain) {
729 tmp = chain;
730 chain = chain->bi_next;
731 bio_put(tmp);
732 }
733}
734
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, tracking the running
 * byte position; once past @start_ofs, the remainder of each segment
 * is zero-filled through a temporary kmap.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the part of bv past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
761
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * On return *old points at the first unconsumed bio and *next at the
 * continuation point (either the second half of a split or the next
 * bio).  Returns the cloned chain, or NULL on allocation/split
 * failure (already-cloned bios are released).
 *
 * NOTE(review): the inner "struct bio_pair *bp" shadows the *bp
 * parameter, so a pair created by bio_split() here is never passed
 * back to the caller through *bp — verify against the callers that
 * this is the intended ownership (released on the next call via the
 * bio_pair_release() at the top).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;
	int total = 0;

	/* Release any bio_pair left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		/* Append tmp to the new chain */
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
836
837/*
838 * helpers for osd request op vectors.
839 */
Alex Elder57cfc102012-06-26 12:57:03 -0700840static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
841 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700842{
Alex Elder57cfc102012-06-26 12:57:03 -0700843 struct ceph_osd_req_op *ops;
844
845 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
846 if (!ops)
847 return NULL;
848
849 ops[0].op = opcode;
850
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700851 /*
852 * op extent offset and length will be set later on
853 * in calc_raw_layout()
854 */
Alex Elder57cfc102012-06-26 12:57:03 -0700855 ops[0].payload_len = payload_len;
856
857 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700858}
859
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is safe). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
864
/*
 * Record completion status for one segment of a collected
 * (multi-segment) block request, then complete any contiguous run of
 * finished segments, in order, against the block layer.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* single-segment request: complete it directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes updates to the collection's status array */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* find how far the in-order completed prefix now extends */
	while (max < coll->total && coll->status[max].done)
		max++;

	/* complete that run; each segment drops a collection reference */
	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
902
/* Complete the collection segment associated with an rbd_request. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
908
/*
 * Send ceph osd request
 *
 * Builds and submits one OSD request for the given object extent.
 * If rbd_cb is NULL the call is synchronous: we wait for completion
 * and release the request here.  Otherwise rbd_cb runs on completion
 * and owns the request from then on.  On failure the status is still
 * propagated through the collection (if one was supplied).
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still report completion so the collection can finish */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
	     (unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
				      false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per stripe: unit and object size are both 1 << order */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
			     req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		/* keep the request registered for resend (e.g. watch) */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and drop the request ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
		     (unsigned long long)
		     le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1020
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests: parses the reply,
 * zero-fills missing/short reads, completes the collection segment,
 * and releases the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* ops follow the reply header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	/* a nonexistent object reads as zeroes; a short read is padded */
	if (rc == -ENOENT && read_op) {
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1060
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1065
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector for the transfer, submits the request with
 * no callback (so rbd_do_request() waits for completion) and, for
 * reads, copies the result into buf.  Returns bytes transferred or a
 * negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	BUG_ON(ops == NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* on success, ret is the number of bytes read into the pages */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1109
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the device-relative extent [ofs, ofs+len) onto its backing
 * object (segment) and submits one OSD read or write for it.  The
 * caller guarantees the extent does not cross a segment boundary.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate device offset to (object name, offset, length) */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1165
1166/*
1167 * Request async osd write
1168 */
1169static int rbd_req_write(struct request *rq,
1170 struct rbd_device *rbd_dev,
1171 struct ceph_snap_context *snapc,
1172 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001173 struct bio *bio,
1174 struct rbd_req_coll *coll,
1175 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001176{
1177 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1178 CEPH_OSD_OP_WRITE,
1179 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001180 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181}
1182
1183/*
1184 * Request async osd read
1185 */
1186static int rbd_req_read(struct request *rq,
1187 struct rbd_device *rbd_dev,
1188 u64 snapid,
1189 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001190 struct bio *bio,
1191 struct rbd_req_coll *coll,
1192 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001193{
1194 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001195 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001196 CEPH_OSD_OP_READ,
1197 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001198 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001199}
1200
/*
 * Request sync osd read
 *
 * Synchronously read len bytes at ofs from the named object into buf.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1226
/*
 * Acknowledge a notification received on the header object watch so
 * that the notifier's pending notify can complete.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/* NOTE(review): cookie is assigned without cpu_to_le64(), unlike
	 * watch.ver above and the cookies in rbd_req_sync_watch()/
	 * rbd_req_sync_unwatch() -- confirm whether this matters on
	 * big-endian hosts */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1256
/*
 * Callback for notifications on the header object watch: refresh the
 * in-core header, then acknowledge the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
	     rbd_dev->header_name, (unsigned long long) notify_id,
	     (unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* ack even after a failed refresh so the notifier isn't stalled */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1276
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object so header changes trigger
 * rbd_watch_cb().  The request lingers at the osd client (stored in
 * rbd_dev->watch_request) until explicitly unwatched.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1320
/*
 * Request sync osd unwatch
 *
 * Tears down the header-object watch registered by
 * rbd_req_sync_watch() and releases the associated event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = unregister the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1350
/* Carries the rbd_device through the notify event callback. */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1354
1355static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1356{
Alex Elder0ce1a792012-07-03 16:01:18 -05001357 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1358 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001359 return;
1360
Alex Elderbd919d42012-07-13 20:35:11 -05001361 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1362 rbd_dev->header_name, (unsigned long long) notify_id,
1363 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001364}
1365
/*
 * Request sync osd notify
 *
 * Sends a notification on the header object and waits (with timeout)
 * for watchers to acknowledge it.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	/* one-shot event, completed when the notify is acknowledged */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       rbd_dev->header_name,
			       0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/* NOTE(review): a wait failure/timeout is logged but the function
	 * still returns 0 -- confirm this is intentional */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1415
/*
 * Synchronously call a method of an OSD object class
 * (CEPH_OSD_OP_CALL) on the named object, passing len bytes of
 * input data.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int ret;

	/* payload carries class name, method name and the input data */
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
				class_name_len + method_name_len + len);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1456
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001457static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1458{
1459 struct rbd_req_coll *coll =
1460 kzalloc(sizeof(struct rbd_req_coll) +
1461 sizeof(struct rbd_req_status) * num_reqs,
1462 GFP_ATOMIC);
1463
1464 if (!coll)
1465 return NULL;
1466 coll->total = num_reqs;
1467 kref_init(&coll->kref);
1468 return coll;
1469}
1470
/*
 * block device queue callback
 *
 * Drains the request queue: each block request is validated, split on
 * rbd object (segment) boundaries, and each piece is submitted as an
 * asynchronous OSD read or write.  Per-request status is tracked in
 * an rbd_req_coll so segments may complete out of order.  queue_lock
 * is held on entry and exit but dropped around submission.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock while we talk to the osd client */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted under us */
		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the duration of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* submit one OSD op per segment */
		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the initial collection reference from rbd_alloc_coll */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* re-take the queue lock before the next blk_fetch_request */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1592
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;
	sector_t sector;
	unsigned int bio_sectors;
	int max;

	/* object size in sectors sets the merge boundary */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes remaining before the bio would cross an object boundary */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;	/* first bvec is always accepted */
	return max;
}
1619
/*
 * Tear down the gendisk and request queue of an rbd device and free
 * the cached image header.  Safe to call if the disk was never set
 * up (rbd_dev->disk == NULL).
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Release the in-core image header (snapshots, names, sizes) */
	rbd_header_free(&rbd_dev->header);

	/* GENHD_FL_UP is only set once add_disk() has been called */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1635
1636/*
Alex Elder4156d992012-08-02 11:29:46 -05001637 * Read the complete header for the given rbd device.
1638 *
1639 * Returns a pointer to a dynamically-allocated buffer containing
1640 * the complete and validated header. Caller can pass the address
1641 * of a variable that will be filled in with the version of the
1642 * header object at the time it was read.
1643 *
1644 * Returns a pointer-coded errno if a failure occurs.
1645 */
1646static struct rbd_image_header_ondisk *
1647rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1648{
1649 struct rbd_image_header_ondisk *ondisk = NULL;
1650 u32 snap_count = 0;
1651 u64 names_size = 0;
1652 u32 want_count;
1653 int ret;
1654
1655 /*
1656 * The complete header will include an array of its 64-bit
1657 * snapshot ids, followed by the names of those snapshots as
1658 * a contiguous block of NUL-terminated strings. Note that
1659 * the number of snapshots could change by the time we read
1660 * it in, in which case we re-read it.
1661 */
1662 do {
1663 size_t size;
1664
1665 kfree(ondisk);
1666
1667 size = sizeof (*ondisk);
1668 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1669 size += names_size;
1670 ondisk = kmalloc(size, GFP_KERNEL);
1671 if (!ondisk)
1672 return ERR_PTR(-ENOMEM);
1673
1674 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1675 rbd_dev->header_name,
1676 0, size,
1677 (char *) ondisk, version);
1678
1679 if (ret < 0)
1680 goto out_err;
1681 if (WARN_ON((size_t) ret < size)) {
1682 ret = -ENXIO;
1683 pr_warning("short header read for image %s"
1684 " (want %zd got %d)\n",
1685 rbd_dev->image_name, size, ret);
1686 goto out_err;
1687 }
1688 if (!rbd_dev_ondisk_valid(ondisk)) {
1689 ret = -ENXIO;
1690 pr_warning("invalid header for image %s\n",
1691 rbd_dev->image_name);
1692 goto out_err;
1693 }
1694
1695 names_size = le64_to_cpu(ondisk->snap_names_len);
1696 want_count = snap_count;
1697 snap_count = le32_to_cpu(ondisk->snap_count);
1698 } while (snap_count != want_count);
1699
1700 return ondisk;
1701
1702out_err:
1703 kfree(ondisk);
1704
1705 return ERR_PTR(ret);
1706}
1707
1708/*
1709 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001710 */
1711static int rbd_read_header(struct rbd_device *rbd_dev,
1712 struct rbd_image_header *header)
1713{
Alex Elder4156d992012-08-02 11:29:46 -05001714 struct rbd_image_header_ondisk *ondisk;
1715 u64 ver = 0;
1716 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001717
Alex Elder4156d992012-08-02 11:29:46 -05001718 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1719 if (IS_ERR(ondisk))
1720 return PTR_ERR(ondisk);
1721 ret = rbd_header_from_disk(header, ondisk);
1722 if (ret >= 0)
1723 header->obj_version = ver;
1724 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001725
Alex Elder4156d992012-08-02 11:29:46 -05001726 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001727}
1728
1729/*
1730 * create a snapshot
1731 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001732static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001733 const char *snap_name,
1734 gfp_t gfp_flags)
1735{
1736 int name_len = strlen(snap_name);
1737 u64 new_snapid;
1738 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001739 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001740 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001741
1742 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001743 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744 return -EINVAL;
1745
Alex Elder0ce1a792012-07-03 16:01:18 -05001746 monc = &rbd_dev->rbd_client->client->monc;
1747 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001748 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001749 if (ret < 0)
1750 return ret;
1751
1752 data = kmalloc(name_len + 16, gfp_flags);
1753 if (!data)
1754 return -ENOMEM;
1755
Sage Weil916d4d62011-05-12 16:10:50 -07001756 p = data;
1757 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001758
Sage Weil916d4d62011-05-12 16:10:50 -07001759 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1760 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001761
Alex Elder0bed54d2012-07-03 16:01:18 -05001762 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001763 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001764 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001765
Sage Weil916d4d62011-05-12 16:10:50 -07001766 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001767
Alex Elder505cbb92012-07-19 08:49:18 -05001768 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001769bad:
1770 return -ERANGE;
1771}
1772
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001773static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1774{
1775 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001776 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001777
Alex Eldera0593292012-07-19 09:09:27 -05001778 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001779 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001780}
1781
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001782/*
1783 * only read the first part of the ondisk header, without the snaps info
1784 */
Alex Elderb8136232012-07-25 09:32:41 -05001785static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001786{
1787 int ret;
1788 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001789
1790 ret = rbd_read_header(rbd_dev, &h);
1791 if (ret < 0)
1792 return ret;
1793
Josh Durgina51aa0c2011-12-05 10:35:04 -08001794 down_write(&rbd_dev->header_rwsem);
1795
Sage Weil9db4b3e2011-04-19 22:49:06 -07001796 /* resized? */
Josh Durgin474ef7c2011-11-21 17:13:54 -08001797 if (rbd_dev->snap_id == CEPH_NOSNAP) {
1798 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1799
1800 dout("setting size to %llu sectors", (unsigned long long) size);
1801 set_capacity(rbd_dev->disk, size);
1802 }
Sage Weil9db4b3e2011-04-19 22:49:06 -07001803
Alex Elder849b4262012-07-09 21:04:24 -05001804 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001805 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001806 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001807 /* osd requests may still refer to snapc */
1808 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001809
Alex Elderb8136232012-07-25 09:32:41 -05001810 if (hver)
1811 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001812 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001813 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001814 rbd_dev->header.total_snaps = h.total_snaps;
1815 rbd_dev->header.snapc = h.snapc;
1816 rbd_dev->header.snap_names = h.snap_names;
1817 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001818 /* Free the extra copy of the object prefix */
1819 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1820 kfree(h.object_prefix);
1821
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001822 ret = __rbd_init_snaps_header(rbd_dev);
1823
Josh Durginc6666012011-11-21 17:11:12 -08001824 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001825
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001826 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001827}
1828
Alex Elder1fe5e992012-07-25 09:32:41 -05001829static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1830{
1831 int ret;
1832
1833 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1834 ret = __rbd_refresh_header(rbd_dev, hver);
1835 mutex_unlock(&ctl_mutex);
1836
1837 return ret;
1838}
1839
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840static int rbd_init_disk(struct rbd_device *rbd_dev)
1841{
1842 struct gendisk *disk;
1843 struct request_queue *q;
1844 int rc;
Alex Elder593a9e72012-02-07 12:03:37 -06001845 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001846 u64 total_size = 0;
1847
1848 /* contact OSD, request size info about the object being mapped */
1849 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1850 if (rc)
1851 return rc;
1852
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001853 /* no need to lock here, as rbd_dev is not registered yet */
1854 rc = __rbd_init_snaps_header(rbd_dev);
1855 if (rc)
1856 return rc;
1857
Josh Durgincc9d7342011-11-21 18:19:13 -08001858 rc = rbd_header_set_snap(rbd_dev, &total_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001859 if (rc)
1860 return rc;
1861
1862 /* create gendisk info */
1863 rc = -ENOMEM;
1864 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1865 if (!disk)
1866 goto out;
1867
Alex Elderf0f8cef2012-01-29 13:57:44 -06001868 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001869 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001870 disk->major = rbd_dev->major;
1871 disk->first_minor = 0;
1872 disk->fops = &rbd_bd_ops;
1873 disk->private_data = rbd_dev;
1874
1875 /* init rq */
1876 rc = -ENOMEM;
1877 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1878 if (!q)
1879 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001880
Alex Elder593a9e72012-02-07 12:03:37 -06001881 /* We use the default size, but let's be explicit about it. */
1882 blk_queue_physical_block_size(q, SECTOR_SIZE);
1883
Josh Durgin029bcbd2011-07-22 11:35:23 -07001884 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001885 segment_size = rbd_obj_bytes(&rbd_dev->header);
1886 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1887 blk_queue_max_segment_size(q, segment_size);
1888 blk_queue_io_min(q, segment_size);
1889 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001890
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001891 blk_queue_merge_bvec(q, rbd_merge_bvec);
1892 disk->queue = q;
1893
1894 q->queuedata = rbd_dev;
1895
1896 rbd_dev->disk = disk;
1897 rbd_dev->q = q;
1898
1899 /* finally, announce the disk to the world */
Alex Elder593a9e72012-02-07 12:03:37 -06001900 set_capacity(disk, total_size / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001901 add_disk(disk);
1902
1903 pr_info("%s: added with size 0x%llx\n",
1904 disk->disk_name, (unsigned long long)total_size);
1905 return 0;
1906
1907out_disk:
1908 put_disk(disk);
1909out:
1910 return rc;
1911}
1912
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913/*
1914 sysfs
1915*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001916
Alex Elder593a9e72012-02-07 12:03:37 -06001917static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1918{
1919 return container_of(dev, struct rbd_device, dev);
1920}
1921
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001922static ssize_t rbd_size_show(struct device *dev,
1923 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001924{
Alex Elder593a9e72012-02-07 12:03:37 -06001925 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001926 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001927
Josh Durgina51aa0c2011-12-05 10:35:04 -08001928 down_read(&rbd_dev->header_rwsem);
1929 size = get_capacity(rbd_dev->disk);
1930 up_read(&rbd_dev->header_rwsem);
1931
1932 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001933}
1934
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001935static ssize_t rbd_major_show(struct device *dev,
1936 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001937{
Alex Elder593a9e72012-02-07 12:03:37 -06001938 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939
1940 return sprintf(buf, "%d\n", rbd_dev->major);
1941}
1942
1943static ssize_t rbd_client_id_show(struct device *dev,
1944 struct device_attribute *attr, char *buf)
1945{
Alex Elder593a9e72012-02-07 12:03:37 -06001946 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001947
Alex Elder1dbb4392012-01-24 10:08:37 -06001948 return sprintf(buf, "client%lld\n",
1949 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001950}
1951
1952static ssize_t rbd_pool_show(struct device *dev,
1953 struct device_attribute *attr, char *buf)
1954{
Alex Elder593a9e72012-02-07 12:03:37 -06001955 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001956
1957 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1958}
1959
Alex Elder9bb2f332012-07-12 10:46:35 -05001960static ssize_t rbd_pool_id_show(struct device *dev,
1961 struct device_attribute *attr, char *buf)
1962{
1963 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1964
1965 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1966}
1967
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968static ssize_t rbd_name_show(struct device *dev,
1969 struct device_attribute *attr, char *buf)
1970{
Alex Elder593a9e72012-02-07 12:03:37 -06001971 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001972
Alex Elder0bed54d2012-07-03 16:01:18 -05001973 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001974}
1975
1976static ssize_t rbd_snap_show(struct device *dev,
1977 struct device_attribute *attr,
1978 char *buf)
1979{
Alex Elder593a9e72012-02-07 12:03:37 -06001980 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001981
1982 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1983}
1984
1985static ssize_t rbd_image_refresh(struct device *dev,
1986 struct device_attribute *attr,
1987 const char *buf,
1988 size_t size)
1989{
Alex Elder593a9e72012-02-07 12:03:37 -06001990 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001991 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001992
Alex Elder1fe5e992012-07-25 09:32:41 -05001993 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001994
1995 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001996}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001997
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001998static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1999static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2000static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2001static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05002002static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002003static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2004static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2005static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2006static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002007
2008static struct attribute *rbd_attrs[] = {
2009 &dev_attr_size.attr,
2010 &dev_attr_major.attr,
2011 &dev_attr_client_id.attr,
2012 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05002013 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002014 &dev_attr_name.attr,
2015 &dev_attr_current_snap.attr,
2016 &dev_attr_refresh.attr,
2017 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002018 NULL
2019};
2020
2021static struct attribute_group rbd_attr_group = {
2022 .attrs = rbd_attrs,
2023};
2024
2025static const struct attribute_group *rbd_attr_groups[] = {
2026 &rbd_attr_group,
2027 NULL
2028};
2029
2030static void rbd_sysfs_dev_release(struct device *dev)
2031{
2032}
2033
2034static struct device_type rbd_device_type = {
2035 .name = "rbd",
2036 .groups = rbd_attr_groups,
2037 .release = rbd_sysfs_dev_release,
2038};
2039
2040
2041/*
2042 sysfs - snapshots
2043*/
2044
2045static ssize_t rbd_snap_size_show(struct device *dev,
2046 struct device_attribute *attr,
2047 char *buf)
2048{
2049 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2050
Josh Durgin35915382011-12-05 18:25:13 -08002051 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002052}
2053
2054static ssize_t rbd_snap_id_show(struct device *dev,
2055 struct device_attribute *attr,
2056 char *buf)
2057{
2058 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2059
Josh Durgin35915382011-12-05 18:25:13 -08002060 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002061}
2062
/* Per-snapshot sysfs attributes (under the parent rbd device) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final device reference dropped: free the snapshot and its name */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2092
Alex Elder14e70852012-07-19 09:09:27 -05002093static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002094{
2095 list_del(&snap->node);
2096 device_unregister(&snap->dev);
2097}
2098
Alex Elder14e70852012-07-19 09:09:27 -05002099static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002100 struct device *parent)
2101{
2102 struct device *dev = &snap->dev;
2103 int ret;
2104
2105 dev->type = &rbd_snap_device_type;
2106 dev->parent = parent;
2107 dev->release = rbd_snap_dev_release;
2108 dev_set_name(dev, "snap_%s", snap->name);
2109 ret = device_register(dev);
2110
2111 return ret;
2112}
2113
Alex Elder4e891e02012-07-10 20:30:10 -05002114static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2115 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002116{
Alex Elder4e891e02012-07-10 20:30:10 -05002117 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002118 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002119
2120 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002121 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002122 return ERR_PTR(-ENOMEM);
2123
2124 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002125 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002126 if (!snap->name)
2127 goto err;
2128
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002129 snap->size = rbd_dev->header.snap_sizes[i];
2130 snap->id = rbd_dev->header.snapc->snaps[i];
2131 if (device_is_registered(&rbd_dev->dev)) {
Alex Elder14e70852012-07-19 09:09:27 -05002132 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002133 if (ret < 0)
2134 goto err;
2135 }
Alex Elder4e891e02012-07-10 20:30:10 -05002136
2137 return snap;
2138
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002139err:
2140 kfree(snap->name);
2141 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002142
2143 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002144}
2145
2146/*
Alex Elder35938152012-08-02 11:29:46 -05002147 * Scan the rbd device's current snapshot list and compare it to the
2148 * newly-received snapshot context. Remove any existing snapshots
2149 * not present in the new snapshot context. Add a new snapshot for
2150 * any snaphots in the snapshot context not in the current list.
2151 * And verify there are no changes to snapshots we already know
2152 * about.
2153 *
2154 * Assumes the snapshots in the snapshot context are sorted by
2155 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2156 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	char *snap_name = rbd_dev->header.snap_names;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/*
	 * Merge-walk the snapshot context array and the existing
	 * snapshot list in parallel; both are ordered by snapshot id
	 * (see the comment above this function).
	 */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;

		/* CEPH_NOSNAP / NULL mark an exhausted sequence */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		BUG_ON(snap && snap->id == CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, it's now gone */
			if (rbd_dev->snap_id == snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, index,
							snap_name);
			if (IS_ERR(new_snap))
				return PTR_ERR(new_snap);

			/* New goes before existing, or at end of list */

			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
			BUG_ON(strcmp(snap->name, snap_name));

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
		snap_name += strlen(snap_name) + 1;
	}

	return 0;
}
2227
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002228static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2229{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002230 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002231 struct device *dev;
2232 struct rbd_snap *snap;
2233
2234 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2235 dev = &rbd_dev->dev;
2236
2237 dev->bus = &rbd_bus_type;
2238 dev->type = &rbd_device_type;
2239 dev->parent = &rbd_root_dev;
2240 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002241 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002242 ret = device_register(dev);
2243 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002244 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002245
2246 list_for_each_entry(snap, &rbd_dev->snaps, node) {
Alex Elder14e70852012-07-19 09:09:27 -05002247 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002248 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002249 break;
2250 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002251out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002252 mutex_unlock(&ctl_mutex);
2253 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002254}
2255
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002256static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2257{
2258 device_unregister(&rbd_dev->dev);
2259}
2260
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002261static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2262{
2263 int ret, rc;
2264
2265 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002266 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002267 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002268 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002269 if (rc < 0)
2270 return rc;
2271 }
2272 } while (ret == -ERANGE);
2273
2274 return ret;
2275}
2276
Alex Elder1ddbe942012-01-29 13:57:44 -06002277static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2278
2279/*
Alex Elder499afd52012-02-02 08:13:29 -06002280 * Get a unique rbd identifier for the given new rbd_dev, and add
2281 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002282 */
Alex Elder499afd52012-02-02 08:13:29 -06002283static void rbd_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002284{
Alex Elderde71a292012-07-03 16:01:19 -05002285 rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002286
2287 spin_lock(&rbd_dev_list_lock);
2288 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2289 spin_unlock(&rbd_dev_list_lock);
Alex Elder1ddbe942012-01-29 13:57:44 -06002290}
Alex Elderb7f23c32012-01-29 13:57:43 -06002291
Alex Elder1ddbe942012-01-29 13:57:44 -06002292/*
Alex Elder499afd52012-02-02 08:13:29 -06002293 * Remove an rbd_dev from the global list, and record that its
2294 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002295 */
Alex Elder499afd52012-02-02 08:13:29 -06002296static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002297{
Alex Elderd184f6b2012-01-29 13:57:44 -06002298 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002299 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002300 int max_id;
2301
2302 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002303
2304 spin_lock(&rbd_dev_list_lock);
2305 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002306
2307 /*
2308 * If the id being "put" is not the current maximum, there
2309 * is nothing special we need to do.
2310 */
2311 if (rbd_id != atomic64_read(&rbd_id_max)) {
2312 spin_unlock(&rbd_dev_list_lock);
2313 return;
2314 }
2315
2316 /*
2317 * We need to update the current maximum id. Search the
2318 * list to find out what it is. We're more likely to find
2319 * the maximum at the end, so search the list backward.
2320 */
2321 max_id = 0;
2322 list_for_each_prev(tmp, &rbd_dev_list) {
2323 struct rbd_device *rbd_dev;
2324
2325 rbd_dev = list_entry(tmp, struct rbd_device, node);
2326 if (rbd_id > max_id)
2327 max_id = rbd_id;
2328 }
Alex Elder499afd52012-02-02 08:13:29 -06002329 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002330
Alex Elder1ddbe942012-01-29 13:57:44 -06002331 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002332 * The max id could have been updated by rbd_id_get(), in
2333 * which case it now accurately reflects the new maximum.
2334 * Be careful not to overwrite the maximum value in that
2335 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002336 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002337 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002338}
2339
Alex Eldera725f65e2012-02-02 08:13:30 -06002340/*
Alex Eldere28fff262012-02-02 08:13:30 -06002341 * Skips over white space at *buf, and updates *buf to point to the
2342 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002343 * the token (string of non-white space characters) found. Note
2344 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002345 */
2346static inline size_t next_token(const char **buf)
2347{
2348 /*
2349 * These are the characters that produce nonzero for
2350 * isspace() in the "C" and "POSIX" locales.
2351 */
2352 const char *spaces = " \f\n\r\t\v";
2353
2354 *buf += strspn(*buf, spaces); /* Find start of token */
2355
2356 return strcspn(*buf, spaces); /* Return token length */
2357}
2358
2359/*
2360 * Finds the next token in *buf, and if the provided token buffer is
2361 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002362 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2363 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002364 *
2365 * Returns the length of the token found (not including the '\0').
2366 * Return value will be 0 if no token is found, and it will be >=
2367 * token_size if the token would not fit.
2368 *
Alex Elder593a9e72012-02-07 12:03:37 -06002369 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002370 * found token. Note that this occurs even if the token buffer is
2371 * too small to hold it.
2372 */
2373static inline size_t copy_token(const char **buf,
2374 char *token,
2375 size_t token_size)
2376{
2377 size_t len;
2378
2379 len = next_token(buf);
2380 if (len < token_size) {
2381 memcpy(token, *buf, len);
2382 *(token + len) = '\0';
2383 }
2384 *buf += len;
2385
2386 return len;
2387}
2388
2389/*
Alex Elderea3352f2012-07-09 21:04:23 -05002390 * Finds the next token in *buf, dynamically allocates a buffer big
2391 * enough to hold a copy of it, and copies the token into the new
2392 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2393 * that a duplicate buffer is created even for a zero-length token.
2394 *
2395 * Returns a pointer to the newly-allocated duplicate, or a null
2396 * pointer if memory for the duplicate was not available. If
2397 * the lenp argument is a non-null pointer, the length of the token
2398 * (not including the '\0') is returned in *lenp.
2399 *
2400 * If successful, the *buf pointer will be updated to point beyond
2401 * the end of the found token.
2402 *
2403 * Note: uses GFP_KERNEL for allocation.
2404 */
2405static inline char *dup_token(const char **buf, size_t *lenp)
2406{
2407 char *dup;
2408 size_t len;
2409
2410 len = next_token(buf);
2411 dup = kmalloc(len + 1, GFP_KERNEL);
2412 if (!dup)
2413 return NULL;
2414
2415 memcpy(dup, *buf, len);
2416 *(dup + len) = '\0';
2417 *buf += len;
2418
2419 if (lenp)
2420 *lenp = len;
2421
2422 return dup;
2423}
2424
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * and header_name fields of the given rbd_dev, based on the list of
 * monitor addresses and other options provided via /sys/bus/rbd/add.
 *
 * The monitor address list is returned by reference (*mon_addrs
 * points into buf; *mon_addrs_size includes one byte beyond it),
 * and the options token is copied into the caller-supplied buffer.
 *
 * Returns 0 on success, -EINVAL on a malformed command, or -ENOMEM
 * if any allocation fails.  On error all fields allocated here are
 * freed and reset, so the caller need not clean them up.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	/* Size includes room for a '\0'; the token itself stays in buf */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* copy_token() returns >= options_size if the token won't fit */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* Any failure from here on is an allocation failure */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Undo everything allocated above; pointers are NULLed so the
	 * caller's later cleanup (keyed on pool_name) stays safe. */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2507
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002508static ssize_t rbd_add(struct bus_type *bus,
2509 const char *buf,
2510 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002511{
Alex Eldercb8627c2012-07-09 21:04:23 -05002512 char *options;
2513 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002514 const char *mon_addrs = NULL;
2515 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002516 struct ceph_osd_client *osdc;
2517 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002518
2519 if (!try_module_get(THIS_MODULE))
2520 return -ENODEV;
2521
Alex Elder27cc2592012-02-02 08:13:30 -06002522 options = kmalloc(count, GFP_KERNEL);
2523 if (!options)
2524 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002525 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2526 if (!rbd_dev)
2527 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002528
2529 /* static rbd_device initialization */
2530 spin_lock_init(&rbd_dev->lock);
2531 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002532 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002533 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002534
Alex Elderd184f6b2012-01-29 13:57:44 -06002535 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002536 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002537
Alex Eldera725f65e2012-02-02 08:13:30 -06002538 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002539 BUILD_BUG_ON(DEV_NAME_LEN
2540 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002541 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002542
Alex Eldera725f65e2012-02-02 08:13:30 -06002543 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002544 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002545 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002546 if (rc)
2547 goto err_put_id;
2548
Alex Elderf8c38922012-08-10 13:12:07 -07002549 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2550 if (rc < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002551 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002552
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002553 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002554 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002555 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2556 if (rc < 0)
2557 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002558 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002559
2560 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002561 rc = register_blkdev(0, rbd_dev->name);
2562 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002563 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002564 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002565
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002566 rc = rbd_bus_add_dev(rbd_dev);
2567 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002568 goto err_out_blkdev;
2569
Alex Elder32eec682012-02-08 16:11:14 -06002570 /*
2571 * At this point cleanup in the event of an error is the job
2572 * of the sysfs code (initiated by rbd_bus_del_dev()).
2573 *
2574 * Set up and announce blkdev mapping.
2575 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002576 rc = rbd_init_disk(rbd_dev);
2577 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002578 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002579
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002580 rc = rbd_init_watch_dev(rbd_dev);
2581 if (rc)
2582 goto err_out_bus;
2583
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002584 return count;
2585
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002586err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002587 /* this will also clean up rest of rbd_dev stuff */
2588
2589 rbd_bus_del_dev(rbd_dev);
2590 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002591 return rc;
2592
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002593err_out_blkdev:
2594 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2595err_out_client:
2596 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002597err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002598 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002599 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002600 kfree(rbd_dev->header_name);
2601 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002602 kfree(rbd_dev->pool_name);
2603 }
Alex Elder499afd52012-02-02 08:13:29 -06002604 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002605err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002606 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002607 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002608
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002609 dout("Error adding device %s\n", buf);
2610 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002611
2612 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002613}
2614
Alex Elderde71a292012-07-03 16:01:19 -05002615static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002616{
2617 struct list_head *tmp;
2618 struct rbd_device *rbd_dev;
2619
Alex Eldere124a822012-01-29 13:57:44 -06002620 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002621 list_for_each(tmp, &rbd_dev_list) {
2622 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002623 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002624 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002625 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002626 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002627 }
Alex Eldere124a822012-01-29 13:57:44 -06002628 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002629 return NULL;
2630}
2631
/*
 * Device-model release callback for an rbd device: tears down the
 * watch machinery, drops the ceph client, unregisters the block
 * device, and frees everything allocated for the rbd_dev.  Also
 * drops the module reference taken when the device was added.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before dropping the client */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2662
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002663static ssize_t rbd_remove(struct bus_type *bus,
2664 const char *buf,
2665 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666{
2667 struct rbd_device *rbd_dev = NULL;
2668 int target_id, rc;
2669 unsigned long ul;
2670 int ret = count;
2671
2672 rc = strict_strtoul(buf, 10, &ul);
2673 if (rc)
2674 return rc;
2675
2676 /* convert to int; abort if we lost anything in the conversion */
2677 target_id = (int) ul;
2678 if (target_id != ul)
2679 return -EINVAL;
2680
2681 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2682
2683 rbd_dev = __rbd_get_dev(target_id);
2684 if (!rbd_dev) {
2685 ret = -ENOENT;
2686 goto done;
2687 }
2688
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002689 __rbd_remove_all_snaps(rbd_dev);
2690 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002691
2692done:
2693 mutex_unlock(&ctl_mutex);
2694 return ret;
2695}
2696
/*
 * Sysfs store handler that creates a snapshot whose name is given in
 * buf, refreshes the in-core header, and then (best effort) notifies
 * watchers.  Returns count on success or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);	/* count bytes + '\0' */
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf() with size "count" copies at most
	 * count - 1 bytes of buf, silently dropping its final byte,
	 * even though the buffer above could hold all count bytes.
	 * This presumably relies on sysfs input ending with '\n' --
	 * confirm; a write without a trailing newline loses the last
	 * character of the snapshot name.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2737
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002738/*
2739 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002740 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002741 */
2742static int rbd_sysfs_init(void)
2743{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002744 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002745
Alex Elderfed4c142012-02-07 12:03:36 -06002746 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002747 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002748 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002749
Alex Elderfed4c142012-02-07 12:03:36 -06002750 ret = bus_register(&rbd_bus_type);
2751 if (ret < 0)
2752 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002753
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002754 return ret;
2755}
2756
/* Tear down the sysfs entries, in the reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2762
2763int __init rbd_init(void)
2764{
2765 int rc;
2766
2767 rc = rbd_sysfs_init();
2768 if (rc)
2769 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002770 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002771 return 0;
2772}
2773
/* Module exit point: remove the sysfs entries created by rbd_init() */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2778
2779module_init(rbd_init);
2780module_exit(rbd_exit);
2781
2782MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2783MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2784MODULE_DESCRIPTION("rados block device");
2785
2786/* following authorship retained from original osdblk.c */
2787MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2788
2789MODULE_LICENSE("GPL");