blob: 5bcd4ebb22e771b7a5b6d44ba00aa2303402b2e1 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderf0f8cef2012-01-29 13:57:44 -060053#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070055
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
Yehuda Sadeh602adf42010-08-12 16:11:25 -070058#define RBD_MAX_SNAP_NAME_LEN 32
59#define RBD_MAX_OPT_LEN 1024
60
61#define RBD_SNAP_HEAD_NAME "-"
62
Alex Elder81a89792012-02-02 08:13:30 -060063/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060070#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070071
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070072#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
77struct rbd_image_header {
78 u64 image_size;
Alex Elder849b4262012-07-09 21:04:24 -050079 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070080 __u8 obj_order;
81 __u8 crypt_type;
82 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083 struct ceph_snap_context *snapc;
Alex Elder0f1d3f92012-08-02 11:29:44 -050084 u64 snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070085 u32 total_snaps;
86
87 char *snap_names;
88 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070089
90 u64 obj_version;
91};
92
/* Per-client rbd mount options. */
struct rbd_options {
	int notify_timeout;	/* seconds; see RBD_NOTIFY_TIMEOUT_DEFAULT */
};
96
97/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060098 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070099 */
100struct rbd_client {
101 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700102 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700103 struct kref kref;
104 struct list_head node;
105};
106
107/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600108 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700109 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700110struct rbd_req_status {
111 int done;
112 int rc;
113 u64 bytes;
114};
115
116/*
117 * a collection of requests
118 */
119struct rbd_req_coll {
120 int total;
121 int num_done;
122 struct kref kref;
123 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700124};
125
Alex Elderf0f8cef2012-01-29 13:57:44 -0600126/*
127 * a single io request
128 */
129struct rbd_request {
130 struct request *rq; /* blk layer request */
131 struct bio *bio; /* cloned bio */
132 struct page **pages; /* list of used pages */
133 u64 len;
134 int coll_index;
135 struct rbd_req_coll *coll;
136};
137
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800138struct rbd_snap {
139 struct device dev;
140 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800141 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800142 struct list_head node;
143 u64 id;
144};
145
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700146/*
147 * a single device
148 */
149struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500150 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700151
152 int major; /* blkdev assigned major */
153 struct gendisk *disk; /* blkdev's gendisk and rq */
154 struct request_queue *q;
155
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700156 struct rbd_client *rbd_client;
157
158 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
159
160 spinlock_t lock; /* queue lock */
161
162 struct rbd_image_header header;
Alex Elder0bed54d2012-07-03 16:01:18 -0500163 char *image_name;
164 size_t image_name_len;
165 char *header_name;
Alex Elderd22f76e2012-07-12 10:46:35 -0500166 char *pool_name;
Alex Elder9bb2f332012-07-12 10:46:35 -0500167 int pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700168
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700169 struct ceph_osd_event *watch_event;
170 struct ceph_osd_request *watch_request;
171
Josh Durginc6666012011-11-21 17:11:12 -0800172 /* protects updating the header */
173 struct rw_semaphore header_rwsem;
Josh Durgine88a36e2011-11-21 18:14:25 -0800174 /* name of the snapshot this device reads from */
Alex Elder820a5f32012-07-09 21:04:24 -0500175 char *snap_name;
Josh Durgine88a36e2011-11-21 18:14:25 -0800176 /* id of the snapshot this device reads from */
Josh Durgin77dfe992011-11-21 13:04:42 -0800177 u64 snap_id; /* current snapshot id */
Josh Durgine88a36e2011-11-21 18:14:25 -0800178 /* whether the snap_id this device reads from still exists */
179 bool snap_exists;
180 int read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700181
182 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800183
184 /* list of snapshots */
185 struct list_head snaps;
186
187 /* sysfs related */
188 struct device dev;
189};
190
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700191static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600192
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700193static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600194static DEFINE_SPINLOCK(rbd_dev_list_lock);
195
Alex Elder432b8582012-01-29 13:57:44 -0600196static LIST_HEAD(rbd_client_list); /* clients */
197static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800199static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
200static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800201static ssize_t rbd_snap_add(struct device *dev,
202 struct device_attribute *attr,
203 const char *buf,
204 size_t count);
Alex Elder14e70852012-07-19 09:09:27 -0500205static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800206
Alex Elderf0f8cef2012-01-29 13:57:44 -0600207static ssize_t rbd_add(struct bus_type *bus, const char *buf,
208 size_t count);
209static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
210 size_t count);
211
212static struct bus_attribute rbd_bus_attrs[] = {
213 __ATTR(add, S_IWUSR, NULL, rbd_add),
214 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
215 __ATTR_NULL
216};
217
218static struct bus_type rbd_bus_type = {
219 .name = "rbd",
220 .bus_attrs = rbd_bus_attrs,
221};
222
223static void rbd_root_dev_release(struct device *dev)
224{
225}
226
227static struct device rbd_root_dev = {
228 .init_name = "rbd",
229 .release = rbd_root_dev_release,
230};
231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
234{
235 return get_device(&rbd_dev->dev);
236}
237
238static void rbd_put_dev(struct rbd_device *rbd_dev)
239{
240 put_device(&rbd_dev->dev);
241}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700242
Alex Elder1fe5e992012-07-25 09:32:41 -0500243static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700244
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700245static int rbd_open(struct block_device *bdev, fmode_t mode)
246{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
250 return -EROFS;
251
Alex Elder340c7a22012-08-10 13:12:07 -0700252 rbd_get_dev(rbd_dev);
253 set_device_ro(bdev, rbd_dev->read_only);
254
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700255 return 0;
256}
257
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800258static int rbd_release(struct gendisk *disk, fmode_t mode)
259{
260 struct rbd_device *rbd_dev = disk->private_data;
261
262 rbd_put_dev(rbd_dev);
263
264 return 0;
265}
266
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267static const struct block_device_operations rbd_bd_ops = {
268 .owner = THIS_MODULE,
269 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800270 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700271};
272
273/*
274 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500275 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276 */
Alex Elder43ae4702012-07-03 16:01:18 -0500277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700278 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d862012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Alex Elder43ae4702012-07-03 16:01:18 -0500293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600295 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700302 rbdc->rbd_opts = rbd_opts;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d862012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
324/*
325 * Find a ceph client with specific addr and configuration.
326 */
Alex Elder43ae4702012-07-03 16:01:18 -0500327static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700328{
329 struct rbd_client *client_node;
330
Alex Elder43ae4702012-07-03 16:01:18 -0500331 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332 return NULL;
333
334 list_for_each_entry(client_node, &rbd_client_list, node)
Alex Elder43ae4702012-07-03 16:01:18 -0500335 if (!ceph_compare_options(ceph_opts, client_node->client))
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336 return client_node;
337 return NULL;
338}
339
340/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700341 * mount options
342 */
343enum {
344 Opt_notify_timeout,
345 Opt_last_int,
346 /* int args above */
347 Opt_last_string,
348 /* string args above */
349};
350
Alex Elder43ae4702012-07-03 16:01:18 -0500351static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700352 {Opt_notify_timeout, "notify_timeout=%d"},
353 /* int args above */
354 /* string args above */
355 {-1, NULL}
356};
357
358static int parse_rbd_opts_token(char *c, void *private)
359{
Alex Elder43ae4702012-07-03 16:01:18 -0500360 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700361 substring_t argstr[MAX_OPT_ARGS];
362 int token, intval, ret;
363
Alex Elder43ae4702012-07-03 16:01:18 -0500364 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700365 if (token < 0)
366 return -EINVAL;
367
368 if (token < Opt_last_int) {
369 ret = match_int(&argstr[0], &intval);
370 if (ret < 0) {
371 pr_err("bad mount option arg (not int) "
372 "at '%s'\n", c);
373 return ret;
374 }
375 dout("got int token %d val %d\n", token, intval);
376 } else if (token > Opt_last_int && token < Opt_last_string) {
377 dout("got string token %d val %s\n", token,
378 argstr[0].from);
379 } else {
380 dout("got token %d\n", token);
381 }
382
383 switch (token) {
384 case Opt_notify_timeout:
Alex Elder43ae4702012-07-03 16:01:18 -0500385 rbd_opts->notify_timeout = intval;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700386 break;
387 default:
388 BUG_ON(token);
389 }
390 return 0;
391}
392
393/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700394 * Get a ceph client with specific addr and configuration, if one does
395 * not exist create it.
396 */
Alex Elder5214ecc2012-02-02 08:13:30 -0600397static struct rbd_client *rbd_get_client(const char *mon_addr,
398 size_t mon_addr_len,
399 char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700400{
401 struct rbd_client *rbdc;
Alex Elder43ae4702012-07-03 16:01:18 -0500402 struct ceph_options *ceph_opts;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700403 struct rbd_options *rbd_opts;
404
405 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
406 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600407 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700408
409 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700410
Alex Elder43ae4702012-07-03 16:01:18 -0500411 ceph_opts = ceph_parse_options(options, mon_addr,
412 mon_addr + mon_addr_len,
413 parse_rbd_opts_token, rbd_opts);
414 if (IS_ERR(ceph_opts)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600415 kfree(rbd_opts);
Alex Elder43ae4702012-07-03 16:01:18 -0500416 return ERR_CAST(ceph_opts);
Alex Elderee577412012-01-24 10:08:36 -0600417 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700418
Alex Elder432b8582012-01-29 13:57:44 -0600419 spin_lock(&rbd_client_list_lock);
Alex Elder43ae4702012-07-03 16:01:18 -0500420 rbdc = __rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600422 /* using an existing client */
423 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600424 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d32012-01-29 13:57:44 -0600425
Alex Elder43ae4702012-07-03 16:01:18 -0500426 ceph_destroy_options(ceph_opts);
Alex Elder97bb59a2012-01-24 10:08:36 -0600427 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700428
Alex Elderd720bcb2012-02-02 08:13:30 -0600429 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700430 }
Alex Elder432b8582012-01-29 13:57:44 -0600431 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432
Alex Elder43ae4702012-07-03 16:01:18 -0500433 rbdc = rbd_client_create(ceph_opts, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600434
Alex Elderd720bcb2012-02-02 08:13:30 -0600435 if (IS_ERR(rbdc))
436 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437
Alex Elderd720bcb2012-02-02 08:13:30 -0600438 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439}
440
441/*
442 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600443 *
Alex Elder432b8582012-01-29 13:57:44 -0600444 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700445 */
446static void rbd_client_release(struct kref *kref)
447{
448 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
449
450 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500451 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700452 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500453 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700454
455 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700456 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457 kfree(rbdc);
458}
459
460/*
461 * Drop reference to ceph client node. If it's not referenced anymore, release
462 * it.
463 */
464static void rbd_put_client(struct rbd_device *rbd_dev)
465{
466 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
467 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468}
469
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700470/*
471 * Destroy requests collection
472 */
473static void rbd_coll_release(struct kref *kref)
474{
475 struct rbd_req_coll *coll =
476 container_of(kref, struct rbd_req_coll, kref);
477
478 dout("rbd_coll_release %p\n", coll);
479 kfree(coll);
480}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700481
Alex Elder8e94af82012-07-25 09:32:40 -0500482static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
483{
Alex Elder103a1502012-08-02 11:29:45 -0500484 size_t size;
485 u32 snap_count;
486
487 /* The header has to start with the magic rbd header text */
488 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
489 return false;
490
491 /*
492 * The size of a snapshot header has to fit in a size_t, and
493 * that limits the number of snapshots.
494 */
495 snap_count = le32_to_cpu(ondisk->snap_count);
496 size = SIZE_MAX - sizeof (struct ceph_snap_context);
497 if (snap_count > size / sizeof (__le64))
498 return false;
499
500 /*
501 * Not only that, but the size of the entire the snapshot
502 * header must also be representable in a size_t.
503 */
504 size -= snap_count * sizeof (__le64);
505 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
506 return false;
507
508 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500509}
510
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700511/*
512 * Create a new header structure, translate header format from the on-disk
513 * header.
514 */
515static int rbd_header_from_disk(struct rbd_image_header *header,
516 struct rbd_image_header_ondisk *ondisk,
Alex Eldered63f4f2012-07-19 09:09:27 -0500517 u32 allocated_snaps)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700518{
Alex Elderccece232012-07-10 20:30:10 -0500519 u32 snap_count;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500520 size_t size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700521
Alex Elder8e94af82012-07-25 09:32:40 -0500522 if (!rbd_dev_ondisk_valid(ondisk))
Josh Durgin81e759f2011-11-15 14:49:53 -0800523 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800524
Alex Elder6a523252012-07-19 17:12:59 -0500525 memset(header, 0, sizeof (*header));
526
Alex Elder103a1502012-08-02 11:29:45 -0500527 snap_count = le32_to_cpu(ondisk->snap_count);
528
Alex Elder6a523252012-07-19 17:12:59 -0500529 size = sizeof (ondisk->block_name) + 1;
530 header->object_prefix = kmalloc(size, GFP_KERNEL);
531 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700532 return -ENOMEM;
Alex Elder6a523252012-07-19 17:12:59 -0500533 memcpy(header->object_prefix, ondisk->block_name, size - 1);
534 header->object_prefix[size - 1] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600535
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700536 if (snap_count) {
Alex Elderccece232012-07-10 20:30:10 -0500537 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
Alex Elder0f1d3f92012-08-02 11:29:44 -0500538 BUG_ON(header->snap_names_len > (u64) SIZE_MAX);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700539 header->snap_names = kmalloc(header->snap_names_len,
Alex Eldered63f4f2012-07-19 09:09:27 -0500540 GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700541 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500542 goto out_err;
543
Alex Elderd2bb24e2012-07-26 23:37:14 -0500544 size = snap_count * sizeof (*header->snap_sizes);
545 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700546 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500547 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700548 } else {
Alex Elderccece232012-07-10 20:30:10 -0500549 WARN_ON(ondisk->snap_names_len);
550 header->snap_names_len = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700551 header->snap_names = NULL;
552 header->snap_sizes = NULL;
553 }
Alex Elder849b4262012-07-09 21:04:24 -0500554
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700555 header->image_size = le64_to_cpu(ondisk->image_size);
556 header->obj_order = ondisk->options.order;
557 header->crypt_type = ondisk->options.crypt_type;
558 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500559 header->total_snaps = snap_count;
560
Alex Elder28cb7752012-07-26 23:37:15 -0500561 /*
562 * If the number of snapshot ids provided by the caller
563 * doesn't match the number in the entire context there's
564 * no point in going further. Caller will try again after
565 * getting an updated snapshot context from the server.
566 */
567 if (allocated_snaps != snap_count)
568 return 0;
Alex Elder6a523252012-07-19 17:12:59 -0500569
570 size = sizeof (struct ceph_snap_context);
571 size += snap_count * sizeof (header->snapc->snaps[0]);
572 header->snapc = kzalloc(size, GFP_KERNEL);
573 if (!header->snapc)
574 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575
576 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500577 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700578 header->snapc->num_snaps = snap_count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700579
Alex Elder28cb7752012-07-26 23:37:15 -0500580 /* Fill in the snapshot information */
581
582 if (snap_count) {
583 u32 i;
Alex Elderccece232012-07-10 20:30:10 -0500584
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585 for (i = 0; i < snap_count; i++) {
586 header->snapc->snaps[i] =
587 le64_to_cpu(ondisk->snaps[i].id);
588 header->snap_sizes[i] =
589 le64_to_cpu(ondisk->snaps[i].image_size);
590 }
591
592 /* copy snapshot names */
Alex Elderccece232012-07-10 20:30:10 -0500593 memcpy(header->snap_names, &ondisk->snaps[snap_count],
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700594 header->snap_names_len);
595 }
596
597 return 0;
598
Alex Elder6a523252012-07-19 17:12:59 -0500599out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500600 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500601 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500603 header->snap_names = NULL;
Alex Elderd78fd7a2012-07-26 23:37:14 -0500604 header->snap_names_len = 0;
Alex Elder6a523252012-07-19 17:12:59 -0500605 kfree(header->object_prefix);
606 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500607
Alex Elder00f1f362012-02-07 12:03:36 -0600608 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609}
610
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700611static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
612 u64 *seq, u64 *size)
613{
614 int i;
615 char *p = header->snap_names;
616
Alex Elder00f1f362012-02-07 12:03:36 -0600617 for (i = 0; i < header->total_snaps; i++) {
618 if (!strcmp(snap_name, p)) {
619
620 /* Found it. Pass back its id and/or size */
621
622 if (seq)
623 *seq = header->snapc->snaps[i];
624 if (size)
625 *size = header->snap_sizes[i];
626 return i;
627 }
628 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 }
Alex Elder00f1f362012-02-07 12:03:36 -0600630 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631}
632
Alex Elder0ce1a792012-07-03 16:01:18 -0500633static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700634{
Alex Elder78dc4472012-07-19 08:49:18 -0500635 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636
Alex Elder0ce1a792012-07-03 16:01:18 -0500637 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700638
Alex Elder0ce1a792012-07-03 16:01:18 -0500639 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800640 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0ce1a792012-07-03 16:01:18 -0500641 rbd_dev->snap_id = CEPH_NOSNAP;
Josh Durgine88a36e2011-11-21 18:14:25 -0800642 rbd_dev->snap_exists = false;
Alex Elder0ce1a792012-07-03 16:01:18 -0500643 rbd_dev->read_only = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700644 if (size)
Alex Elder78dc4472012-07-19 08:49:18 -0500645 *size = rbd_dev->header.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646 } else {
Alex Elder78dc4472012-07-19 08:49:18 -0500647 u64 snap_id = 0;
648
649 ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
650 &snap_id, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651 if (ret < 0)
652 goto done;
Alex Elder78dc4472012-07-19 08:49:18 -0500653 rbd_dev->snap_id = snap_id;
Josh Durgine88a36e2011-11-21 18:14:25 -0800654 rbd_dev->snap_exists = true;
Alex Elder0ce1a792012-07-03 16:01:18 -0500655 rbd_dev->read_only = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700656 }
657
658 ret = 0;
659done:
Alex Elder0ce1a792012-07-03 16:01:18 -0500660 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700661 return ret;
662}
663
664static void rbd_header_free(struct rbd_image_header *header)
665{
Alex Elder849b4262012-07-09 21:04:24 -0500666 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500667 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500669 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500670 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500671 header->snap_names = NULL;
672 header->snap_names_len = 0;
Josh Durgind1d25642011-12-05 14:03:05 -0800673 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500674 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700675}
676
677/*
678 * get the actual striped segment name, offset and length
679 */
680static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500681 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700682 u64 ofs, u64 len,
683 char *seg_name, u64 *segofs)
684{
685 u64 seg = ofs >> header->obj_order;
686
687 if (seg_name)
688 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500689 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690
691 ofs = ofs & ((1 << header->obj_order) - 1);
692 len = min_t(u64, len, (1 << header->obj_order) - ofs);
693
694 if (segofs)
695 *segofs = ofs;
696
697 return len;
698}
699
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700700static int rbd_get_num_segments(struct rbd_image_header *header,
701 u64 ofs, u64 len)
702{
703 u64 start_seg = ofs >> header->obj_order;
704 u64 end_seg = (ofs + len - 1) >> header->obj_order;
705 return end_seg - start_seg + 1;
706}
707
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700709 * returns the size of an object in the image
710 */
711static u64 rbd_obj_bytes(struct rbd_image_header *header)
712{
713 return 1 << header->obj_order;
714}
715
716/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700717 * bio helpers
718 */
719
720static void bio_chain_put(struct bio *chain)
721{
722 struct bio *tmp;
723
724 while (chain) {
725 tmp = chain;
726 chain = chain->bi_next;
727 bio_put(tmp);
728 }
729}
730
731/*
732 * zeros a bio chain, starting at specific offset
733 */
734static void zero_bio_chain(struct bio *chain, int start_ofs)
735{
736 struct bio_vec *bv;
737 unsigned long flags;
738 void *buf;
739 int i;
740 int pos = 0;
741
742 while (chain) {
743 bio_for_each_segment(bv, chain, i) {
744 if (pos + bv->bv_len > start_ofs) {
745 int remainder = max(start_ofs - pos, 0);
746 buf = bvec_kmap_irq(bv, &flags);
747 memset(buf + remainder, 0,
748 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200749 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700750 }
751 pos += bv->bv_len;
752 }
753
754 chain = chain->bi_next;
755 }
756}
757
758/*
759 * bio_chain_clone - clone a chain of bios up to a certain length.
760 * might return a bio_pair that will need to be released.
761 */
762static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
763 struct bio_pair **bp,
764 int len, gfp_t gfpmask)
765{
766 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
767 int total = 0;
768
769 if (*bp) {
770 bio_pair_release(*bp);
771 *bp = NULL;
772 }
773
774 while (old_chain && (total < len)) {
775 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
776 if (!tmp)
777 goto err_out;
778
779 if (total + old_chain->bi_size > len) {
780 struct bio_pair *bp;
781
782 /*
783 * this split can only happen with a single paged bio,
784 * split_bio will BUG_ON if this is not the case
785 */
786 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500787 "bi_size=%u\n",
788 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700789
790 /* split the bio. We'll release it either in the next
791 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600792 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700793 if (!bp)
794 goto err_out;
795
796 __bio_clone(tmp, &bp->bio1);
797
798 *next = &bp->bio2;
799 } else {
800 __bio_clone(tmp, old_chain);
801 *next = old_chain->bi_next;
802 }
803
804 tmp->bi_bdev = NULL;
805 gfpmask &= ~__GFP_WAIT;
806 tmp->bi_next = NULL;
807
808 if (!new_chain) {
809 new_chain = tail = tmp;
810 } else {
811 tail->bi_next = tmp;
812 tail = tmp;
813 }
814 old_chain = old_chain->bi_next;
815
816 total += tmp->bi_size;
817 }
818
819 BUG_ON(total < len);
820
821 if (tail)
822 tail->bi_next = NULL;
823
824 *old = old_chain;
825
826 return new_chain;
827
828err_out:
829 dout("bio_chain_clone with err\n");
830 bio_chain_put(new_chain);
831 return NULL;
832}
833
834/*
835 * helpers for osd request op vectors.
836 */
Alex Elder57cfc102012-06-26 12:57:03 -0700837static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
838 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700839{
Alex Elder57cfc102012-06-26 12:57:03 -0700840 struct ceph_osd_req_op *ops;
841
842 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
843 if (!ops)
844 return NULL;
845
846 ops[0].op = opcode;
847
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700848 /*
849 * op extent offset and length will be set later on
850 * in calc_raw_layout()
851 */
Alex Elder57cfc102012-06-26 12:57:03 -0700852 ops[0].payload_len = payload_len;
853
854 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700855}
856
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
861
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700862static void rbd_coll_end_req_index(struct request *rq,
863 struct rbd_req_coll *coll,
864 int index,
865 int ret, u64 len)
866{
867 struct request_queue *q;
868 int min, max, i;
869
Alex Elderbd919d42012-07-13 20:35:11 -0500870 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
871 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700872
873 if (!rq)
874 return;
875
876 if (!coll) {
877 blk_end_request(rq, ret, len);
878 return;
879 }
880
881 q = rq->q;
882
883 spin_lock_irq(q->queue_lock);
884 coll->status[index].done = 1;
885 coll->status[index].rc = ret;
886 coll->status[index].bytes = len;
887 max = min = coll->num_done;
888 while (max < coll->total && coll->status[max].done)
889 max++;
890
891 for (i = min; i<max; i++) {
892 __blk_end_request(rq, coll->status[i].rc,
893 coll->status[i].bytes);
894 coll->num_done++;
895 kref_put(&coll->kref, rbd_coll_release);
896 }
897 spin_unlock_irq(q->queue_lock);
898}
899
900static void rbd_coll_end_req(struct rbd_request *req,
901 int ret, u64 len)
902{
903 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
904}
905
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700906/*
907 * Send ceph osd request
908 */
909static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500910 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700911 struct ceph_snap_context *snapc,
912 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500913 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700914 struct bio *bio,
915 struct page **pages,
916 int num_pages,
917 int flags,
918 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700919 struct rbd_req_coll *coll,
920 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700921 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700922 struct ceph_msg *msg),
923 struct ceph_osd_request **linger_req,
924 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700925{
926 struct ceph_osd_request *req;
927 struct ceph_file_layout *layout;
928 int ret;
929 u64 bno;
930 struct timespec mtime = CURRENT_TIME;
931 struct rbd_request *req_data;
932 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600933 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700934
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700935 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700936 if (!req_data) {
937 if (coll)
938 rbd_coll_end_req_index(rq, coll, coll_index,
939 -ENOMEM, len);
940 return -ENOMEM;
941 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700942
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700943 if (coll) {
944 req_data->coll = coll;
945 req_data->coll_index = coll_index;
946 }
947
Alex Elderbd919d42012-07-13 20:35:11 -0500948 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
949 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700950
Alex Elder0ce1a792012-07-03 16:01:18 -0500951 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600952 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
953 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700954 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700955 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956 goto done_pages;
957 }
958
959 req->r_callback = rbd_cb;
960
961 req_data->rq = rq;
962 req_data->bio = bio;
963 req_data->pages = pages;
964 req_data->len = len;
965
966 req->r_priv = req_data;
967
968 reqhead = req->r_request->front.iov_base;
969 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
970
Alex Elderaded07e2012-07-03 16:01:18 -0500971 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700972 req->r_oid_len = strlen(req->r_oid);
973
974 layout = &req->r_file_layout;
975 memset(layout, 0, sizeof(*layout));
976 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
977 layout->fl_stripe_count = cpu_to_le32(1);
978 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500979 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600980 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
981 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700982
983 ceph_osdc_build_request(req, ofs, &len,
984 ops,
985 snapc,
986 &mtime,
987 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700988
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700989 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600990 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700991 *linger_req = req;
992 }
993
Alex Elder1dbb4392012-01-24 10:08:37 -0600994 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700995 if (ret < 0)
996 goto done_err;
997
998 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600999 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001000 if (ver)
1001 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001002 dout("reassert_ver=%llu\n",
1003 (unsigned long long)
1004 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005 ceph_osdc_put_request(req);
1006 }
1007 return ret;
1008
1009done_err:
1010 bio_chain_put(req_data->bio);
1011 ceph_osdc_put_request(req);
1012done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001013 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001014 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001015 return ret;
1016}
1017
1018/*
1019 * Ceph osd op callback
1020 */
1021static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1022{
1023 struct rbd_request *req_data = req->r_priv;
1024 struct ceph_osd_reply_head *replyhead;
1025 struct ceph_osd_op *op;
1026 __s32 rc;
1027 u64 bytes;
1028 int read_op;
1029
1030 /* parse reply */
1031 replyhead = msg->front.iov_base;
1032 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1033 op = (void *)(replyhead + 1);
1034 rc = le32_to_cpu(replyhead->result);
1035 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001036 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001037
Alex Elderbd919d42012-07-13 20:35:11 -05001038 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1039 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001040
1041 if (rc == -ENOENT && read_op) {
1042 zero_bio_chain(req_data->bio, 0);
1043 rc = 0;
1044 } else if (rc == 0 && read_op && bytes < req_data->len) {
1045 zero_bio_chain(req_data->bio, bytes);
1046 bytes = req_data->len;
1047 }
1048
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001049 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001050
1051 if (req_data->bio)
1052 bio_chain_put(req_data->bio);
1053
1054 ceph_osdc_put_request(req);
1055 kfree(req_data);
1056}
1057
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001058static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1059{
1060 ceph_osdc_put_request(req);
1061}
1062
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001063/*
1064 * Do a synchronous ceph osd operation
1065 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001066static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001067 struct ceph_snap_context *snapc,
1068 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001069 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001070 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001071 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001072 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001073 char *buf,
1074 struct ceph_osd_request **linger_req,
1075 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001076{
1077 int ret;
1078 struct page **pages;
1079 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001080
1081 BUG_ON(ops == NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001082
1083 num_pages = calc_pages_for(ofs , len);
1084 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001085 if (IS_ERR(pages))
1086 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001087
Alex Elder0ce1a792012-07-03 16:01:18 -05001088 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001089 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001090 pages, num_pages,
1091 flags,
1092 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001093 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001094 NULL,
1095 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001096 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001097 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001098
1099 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1100 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1101
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001102done:
1103 ceph_release_page_vector(pages, num_pages);
1104 return ret;
1105}
1106
1107/*
1108 * Do an asynchronous ceph osd operation
1109 */
1110static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001111 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001112 struct ceph_snap_context *snapc,
1113 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001114 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001115 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001116 struct bio *bio,
1117 struct rbd_req_coll *coll,
1118 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001119{
1120 char *seg_name;
1121 u64 seg_ofs;
1122 u64 seg_len;
1123 int ret;
1124 struct ceph_osd_req_op *ops;
1125 u32 payload_len;
1126
1127 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1128 if (!seg_name)
1129 return -ENOMEM;
1130
1131 seg_len = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001132 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001133 ofs, len,
1134 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001135
1136 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1137
Alex Elder57cfc102012-06-26 12:57:03 -07001138 ret = -ENOMEM;
1139 ops = rbd_create_rw_ops(1, opcode, payload_len);
1140 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001141 goto done;
1142
1143 /* we've taken care of segment sizes earlier when we
1144 cloned the bios. We should never have a segment
1145 truncated at this point */
1146 BUG_ON(seg_len < len);
1147
1148 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1149 seg_name, seg_ofs, seg_len,
1150 bio,
1151 NULL, 0,
1152 flags,
1153 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001154 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001155 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001156
1157 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001158done:
1159 kfree(seg_name);
1160 return ret;
1161}
1162
1163/*
1164 * Request async osd write
1165 */
1166static int rbd_req_write(struct request *rq,
1167 struct rbd_device *rbd_dev,
1168 struct ceph_snap_context *snapc,
1169 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001170 struct bio *bio,
1171 struct rbd_req_coll *coll,
1172 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001173{
1174 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1175 CEPH_OSD_OP_WRITE,
1176 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001177 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001178}
1179
1180/*
1181 * Request async osd read
1182 */
1183static int rbd_req_read(struct request *rq,
1184 struct rbd_device *rbd_dev,
1185 u64 snapid,
1186 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001187 struct bio *bio,
1188 struct rbd_req_coll *coll,
1189 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001190{
1191 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001192 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001193 CEPH_OSD_OP_READ,
1194 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001195 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001196}
1197
1198/*
1199 * Request sync osd read
1200 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001201static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001202 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001203 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001204 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001205 char *buf,
1206 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001207{
Alex Elder913d2fd2012-06-26 12:57:03 -07001208 struct ceph_osd_req_op *ops;
1209 int ret;
1210
1211 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1212 if (!ops)
1213 return -ENOMEM;
1214
1215 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001216 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001217 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001218 ops, object_name, ofs, len, buf, NULL, ver);
1219 rbd_destroy_ops(ops);
1220
1221 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001222}
1223
1224/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001225 * Request sync osd watch
1226 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001227static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001228 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001229 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001230{
1231 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001232 int ret;
1233
Alex Elder57cfc102012-06-26 12:57:03 -07001234 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1235 if (!ops)
1236 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001237
Josh Durgina71b8912011-12-05 18:10:44 -08001238 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001239 ops[0].watch.cookie = notify_id;
1240 ops[0].watch.flag = 0;
1241
Alex Elder0ce1a792012-07-03 16:01:18 -05001242 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001243 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001244 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001245 CEPH_OSD_FLAG_READ,
1246 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001247 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001248 rbd_simple_req_cb, 0, NULL);
1249
1250 rbd_destroy_ops(ops);
1251 return ret;
1252}
1253
/*
 * Watch event callback for the image header object: refresh the
 * in-core header and acknowledge the notification.
 *
 * NOTE(review): if rbd_refresh_header() fails, 'hver' may be used
 * uninitialized in the ack below -- confirm the failure contract of
 * rbd_refresh_header().
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* always ack, even if the refresh failed, to keep the watch alive */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1273
1274/*
1275 * Request sync osd watch
1276 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001277static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001278{
1279 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001280 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001281 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001282
Alex Elder57cfc102012-06-26 12:57:03 -07001283 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1284 if (!ops)
1285 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001286
1287 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001288 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001289 if (ret < 0)
1290 goto fail;
1291
Alex Elder0e6f3222012-07-25 09:32:40 -05001292 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001293 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001294 ops[0].watch.flag = 1;
1295
Alex Elder0ce1a792012-07-03 16:01:18 -05001296 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001297 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001298 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1299 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001300 rbd_dev->header_name,
1301 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001302 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001303
1304 if (ret < 0)
1305 goto fail_event;
1306
1307 rbd_destroy_ops(ops);
1308 return 0;
1309
1310fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001311 ceph_osdc_cancel_event(rbd_dev->watch_event);
1312 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001313fail:
1314 rbd_destroy_ops(ops);
1315 return ret;
1316}
1317
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001318/*
1319 * Request sync osd unwatch
1320 */
Alex Elder070c6332012-07-25 09:32:41 -05001321static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001322{
1323 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001324 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001325
Alex Elder57cfc102012-06-26 12:57:03 -07001326 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1327 if (!ops)
1328 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001329
1330 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001331 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001332 ops[0].watch.flag = 0;
1333
Alex Elder0ce1a792012-07-03 16:01:18 -05001334 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001335 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001336 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1337 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001338 rbd_dev->header_name,
1339 0, 0, NULL, NULL, NULL);
1340
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001341
1342 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001343 ceph_osdc_cancel_event(rbd_dev->watch_event);
1344 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001345 return ret;
1346}
1347
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001348struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001349 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001350};
1351
/*
 * Completion callback for a self-issued notify: logging only.
 *
 * NOTE(review): the registration site passes '&info', a struct
 * rbd_notify_info *, as @data, but it is cast here to rbd_device * --
 * the dout() below would then read through a misinterpreted pointer.
 * Harmless when debug output is compiled out, but verify.
 */
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	if (!rbd_dev)
		return;

	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
}
1362
1363/*
1364 * Request sync osd notify
1365 */
Alex Elder4cb16252012-07-25 09:32:40 -05001366static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001367{
1368 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001369 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001370 struct ceph_osd_event *event;
1371 struct rbd_notify_info info;
1372 int payload_len = sizeof(u32) + sizeof(u32);
1373 int ret;
1374
Alex Elder57cfc102012-06-26 12:57:03 -07001375 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1376 if (!ops)
1377 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001378
Alex Elder0ce1a792012-07-03 16:01:18 -05001379 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001380
1381 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1382 (void *)&info, &event);
1383 if (ret < 0)
1384 goto fail;
1385
1386 ops[0].watch.ver = 1;
1387 ops[0].watch.flag = 1;
1388 ops[0].watch.cookie = event->cookie;
1389 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1390 ops[0].watch.timeout = 12;
1391
Alex Elder0ce1a792012-07-03 16:01:18 -05001392 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001393 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001394 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1395 ops,
Alex Elder4cb16252012-07-25 09:32:40 -05001396 rbd_dev->header_name,
1397 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001398 if (ret < 0)
1399 goto fail_event;
1400
1401 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1402 dout("ceph_osdc_wait_event returned %d\n", ret);
1403 rbd_destroy_ops(ops);
1404 return 0;
1405
1406fail_event:
1407 ceph_osdc_cancel_event(event);
1408fail:
1409 rbd_destroy_ops(ops);
1410 return ret;
1411}
1412
1413/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001414 * Request sync osd read
1415 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001416static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001417 const char *object_name,
1418 const char *class_name,
1419 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001420 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001421 int len,
1422 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001423{
1424 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001425 int class_name_len = strlen(class_name);
1426 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001427 int ret;
1428
1429 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001430 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001431 if (!ops)
1432 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001433
Alex Elderaded07e2012-07-03 16:01:18 -05001434 ops[0].cls.class_name = class_name;
1435 ops[0].cls.class_len = (__u8) class_name_len;
1436 ops[0].cls.method_name = method_name;
1437 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001438 ops[0].cls.argc = 0;
1439 ops[0].cls.indata = data;
1440 ops[0].cls.indata_len = len;
1441
Alex Elder0ce1a792012-07-03 16:01:18 -05001442 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001443 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001444 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1445 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001446 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001447
1448 rbd_destroy_ops(ops);
1449
1450 dout("cls_exec returned %d\n", ret);
1451 return ret;
1452}
1453
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001454static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1455{
1456 struct rbd_req_coll *coll =
1457 kzalloc(sizeof(struct rbd_req_coll) +
1458 sizeof(struct rbd_req_status) * num_reqs,
1459 GFP_ATOMIC);
1460
1461 if (!coll)
1462 return NULL;
1463 coll->total = num_reqs;
1464 kref_init(&coll->kref);
1465 return coll;
1466}
1467
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001468/*
1469 * block device queue callback
1470 */
1471static void rbd_rq_fn(struct request_queue *q)
1472{
1473 struct rbd_device *rbd_dev = q->queuedata;
1474 struct request *rq;
1475 struct bio_pair *bp = NULL;
1476
Alex Elder00f1f362012-02-07 12:03:36 -06001477 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001478 struct bio *bio;
1479 struct bio *rq_bio, *next_bio = NULL;
1480 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001481 unsigned int size;
1482 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001483 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001484 int num_segs, cur_seg = 0;
1485 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001486 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001487
1488 /* peek at request from block layer */
1489 if (!rq)
1490 break;
1491
1492 dout("fetched request\n");
1493
1494 /* filter out block requests we don't understand */
1495 if ((rq->cmd_type != REQ_TYPE_FS)) {
1496 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001497 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001498 }
1499
1500 /* deduce our operation (read, write) */
1501 do_write = (rq_data_dir(rq) == WRITE);
1502
1503 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001504 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001505 rq_bio = rq->bio;
1506 if (do_write && rbd_dev->read_only) {
1507 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001508 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001509 }
1510
1511 spin_unlock_irq(q->queue_lock);
1512
Josh Durgind1d25642011-12-05 14:03:05 -08001513 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001514
Josh Durgind1d25642011-12-05 14:03:05 -08001515 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001516 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001517 dout("request for non-existent snapshot");
1518 spin_lock_irq(q->queue_lock);
1519 __blk_end_request_all(rq, -ENXIO);
1520 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001521 }
1522
Josh Durgind1d25642011-12-05 14:03:05 -08001523 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1524
1525 up_read(&rbd_dev->header_rwsem);
1526
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001527 dout("%s 0x%x bytes at 0x%llx\n",
1528 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001529 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001530
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001531 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1532 coll = rbd_alloc_coll(num_segs);
1533 if (!coll) {
1534 spin_lock_irq(q->queue_lock);
1535 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001536 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001537 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001538 }
1539
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001540 do {
1541 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001542 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001543 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001544 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001545 ofs, size,
1546 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001547 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001548 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1549 op_size, GFP_ATOMIC);
1550 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001551 rbd_coll_end_req_index(rq, coll, cur_seg,
1552 -ENOMEM, op_size);
1553 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001554 }
1555
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001556
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001557 /* init OSD command: write or read */
1558 if (do_write)
1559 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001560 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001561 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001562 op_size, bio,
1563 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001564 else
1565 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001566 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001567 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001568 op_size, bio,
1569 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001570
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001571next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001572 size -= op_size;
1573 ofs += op_size;
1574
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001575 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001576 rq_bio = next_bio;
1577 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001578 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001579
1580 if (bp)
1581 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001582 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001583
1584 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001585 }
1586}
1587
1588/*
1589 * a queue callback. Makes sure that we don't create a bio that spans across
1590 * multiple osd objects. One exception would be with a single page bios,
1591 * which we handle later at bio_chain_clone
1592 */
1593static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1594 struct bio_vec *bvec)
1595{
1596 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001597 unsigned int chunk_sectors;
1598 sector_t sector;
1599 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001600 int max;
1601
Alex Elder593a9e72012-02-07 12:03:37 -06001602 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1603 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1604 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1605
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001606 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001607 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001608 if (max < 0)
1609 max = 0; /* bio_add cannot handle a negative return */
1610 if (max <= bvec->bv_len && bio_sectors == 0)
1611 return bvec->bv_len;
1612 return max;
1613}
1614
/*
 * Tear down the gendisk, its request queue and the in-core image
 * header for rbd_dev.  Safe to call when no disk was ever created.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	/* Undo rbd_init_disk() in reverse order */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1630
/*
 * reload the ondisk the header
 *
 * Returns 0 and fills in *header on success, or a negative errno.
 * On success the caller owns the allocations inside *header and must
 * release them with rbd_header_free().
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		/* Done if the snapshot count didn't change under us */
		if (snap_count == header->total_snaps)
			break;

		/* Size the buffer for all snap records plus their names */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		/* Drop this attempt's allocations and retry with more room */
		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1688
1689/*
1690 * create a snapshot
1691 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001692static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001693 const char *snap_name,
1694 gfp_t gfp_flags)
1695{
1696 int name_len = strlen(snap_name);
1697 u64 new_snapid;
1698 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001699 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001700 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001701
1702 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001703 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001704 return -EINVAL;
1705
Alex Elder0ce1a792012-07-03 16:01:18 -05001706 monc = &rbd_dev->rbd_client->client->monc;
1707 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001708 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001709 if (ret < 0)
1710 return ret;
1711
1712 data = kmalloc(name_len + 16, gfp_flags);
1713 if (!data)
1714 return -ENOMEM;
1715
Sage Weil916d4d62011-05-12 16:10:50 -07001716 p = data;
1717 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001718
Sage Weil916d4d62011-05-12 16:10:50 -07001719 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1720 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001721
Alex Elder0bed54d2012-07-03 16:01:18 -05001722 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001723 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001724 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001725
Sage Weil916d4d62011-05-12 16:10:50 -07001726 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001727
Alex Elder505cbb92012-07-19 08:49:18 -05001728 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001729bad:
1730 return -ERANGE;
1731}
1732
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001733static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1734{
1735 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001736 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001737
Alex Eldera0593292012-07-19 09:09:27 -05001738 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001739 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001740}
1741
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the on-disk header and swaps the freshly-read fields into
 * rbd_dev->header under header_rwsem.  If hver is non-NULL the new
 * header version is also returned through it.  Caller must hold
 * ctl_mutex (see rbd_refresh_header()).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the sysfs snapshot devices with the new snap context */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1789
/*
 * Refresh the image header, serialized against other control
 * operations by ctl_mutex.
 */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1800
/*
 * Read the image header from the OSDs, build the snapshot list, and
 * create and announce the block device (gendisk + request queue) for
 * rbd_dev.  Returns 0 on success or a negative errno.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* Select the mapped snapshot (or head) and learn its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from straddling object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1873
/*
 sysfs
*/

/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1882
/* sysfs "size" attribute: mapped image size in bytes */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	/* header_rwsem guards capacity against a concurrent refresh/resize */
	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1895
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001896static ssize_t rbd_major_show(struct device *dev,
1897 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001898{
Alex Elder593a9e72012-02-07 12:03:37 -06001899 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001900
1901 return sprintf(buf, "%d\n", rbd_dev->major);
1902}
1903
1904static ssize_t rbd_client_id_show(struct device *dev,
1905 struct device_attribute *attr, char *buf)
1906{
Alex Elder593a9e72012-02-07 12:03:37 -06001907 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001908
Alex Elder1dbb4392012-01-24 10:08:37 -06001909 return sprintf(buf, "client%lld\n",
1910 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001911}
1912
1913static ssize_t rbd_pool_show(struct device *dev,
1914 struct device_attribute *attr, char *buf)
1915{
Alex Elder593a9e72012-02-07 12:03:37 -06001916 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917
1918 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1919}
1920
Alex Elder9bb2f332012-07-12 10:46:35 -05001921static ssize_t rbd_pool_id_show(struct device *dev,
1922 struct device_attribute *attr, char *buf)
1923{
1924 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1925
1926 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1927}
1928
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001929static ssize_t rbd_name_show(struct device *dev,
1930 struct device_attribute *attr, char *buf)
1931{
Alex Elder593a9e72012-02-07 12:03:37 -06001932 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001933
Alex Elder0bed54d2012-07-03 16:01:18 -05001934 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001935}
1936
1937static ssize_t rbd_snap_show(struct device *dev,
1938 struct device_attribute *attr,
1939 char *buf)
1940{
Alex Elder593a9e72012-02-07 12:03:37 -06001941 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001942
1943 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1944}
1945
1946static ssize_t rbd_image_refresh(struct device *dev,
1947 struct device_attribute *attr,
1948 const char *buf,
1949 size_t size)
1950{
Alex Elder593a9e72012-02-07 12:03:37 -06001951 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001952 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001953
Alex Elder1fe5e992012-07-25 09:32:41 -05001954 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001955
1956 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001957}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001958
/* Per-device sysfs attributes (read-only unless noted) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);	/* write-only */
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);	/* write-only */

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing to free here; rbd_dev teardown is handled elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2000
2001
2002/*
2003 sysfs - snapshots
2004*/
2005
2006static ssize_t rbd_snap_size_show(struct device *dev,
2007 struct device_attribute *attr,
2008 char *buf)
2009{
2010 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2011
Josh Durgin35915382011-12-05 18:25:13 -08002012 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002013}
2014
2015static ssize_t rbd_snap_id_show(struct device *dev,
2016 struct device_attribute *attr,
2017 char *buf)
2018{
2019 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2020
Josh Durgin35915382011-12-05 18:25:13 -08002021 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002022}
2023
/* Per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Called when the last reference to the snapshot's device is dropped;
 * this is where the rbd_snap and its name string are freed. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2053
/*
 * Unlink a snapshot from the device's list and unregister its sysfs
 * device.  The rbd_snap itself is freed by rbd_snap_dev_release()
 * when the device's last reference goes away.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2059
Alex Elder14e70852012-07-19 09:09:27 -05002060static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002061 struct device *parent)
2062{
2063 struct device *dev = &snap->dev;
2064 int ret;
2065
2066 dev->type = &rbd_snap_device_type;
2067 dev->parent = parent;
2068 dev->release = rbd_snap_dev_release;
2069 dev_set_name(dev, "snap_%s", snap->name);
2070 ret = device_register(dev);
2071
2072 return ret;
2073}
2074
/*
 * Allocate an rbd_snap for snapshot index i of the current snap
 * context, name it, and (if the parent device is already registered)
 * register its sysfs device.  Returns the new rbd_snap or an
 * ERR_PTR on failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2106
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * Returns 0 on success, or a negative errno from snapshot creation.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	char *snap_name = rbd_dev->header.snap_names;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/* Merge-walk the sorted context and the sorted device list */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		BUG_ON(snap && snap->id == CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it was the mapped snapshot, it no longer exists */
			if (rbd_dev->snap_id == snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, index,
						      snap_name);
			if (IS_ERR(new_snap))
				return PTR_ERR(new_snap);

			/* New goes before existing, or at end of list */

			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add(&new_snap->node, head);
		} else {
			/* Already have this one */

			BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
			BUG_ON(strcmp(snap->name, snap_name));

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
		/* snapshot names are stored consecutively, NUL-separated */
		snap_name += strlen(snap_name) + 1;
	}

	return 0;
}
2188
/*
 * Register rbd_dev (and each of its snapshots) on the rbd bus in
 * sysfs.  Returns 0 on success or a registration error.
 *
 * NOTE(review): if a snapshot registration fails partway through,
 * snapshots registered earlier are not unregistered here —
 * presumably the caller's teardown path handles that; confirm.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* register a sysfs device for each known snapshot */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2216
/* Remove rbd_dev's sysfs device from the rbd bus */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2221
/*
 * Set up the header watch for rbd_dev.  If the watch request fails
 * with -ERANGE — NOTE(review): presumably meaning our cached header
 * version is stale; confirm — refresh the header and retry.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2237
/* Highest device id ever handed out; never decreases except in rbd_id_put() */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	/* atomic increment yields a unique id without holding the lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002252
Alex Elder1ddbe942012-01-29 13:57:44 -06002253/*
Alex Elder499afd52012-02-02 08:13:29 -06002254 * Remove an rbd_dev from the global list, and record that its
2255 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002256 */
Alex Elder499afd52012-02-02 08:13:29 -06002257static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002258{
Alex Elderd184f6b2012-01-29 13:57:44 -06002259 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002260 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002261 int max_id;
2262
2263 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002264
2265 spin_lock(&rbd_dev_list_lock);
2266 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002267
2268 /*
2269 * If the id being "put" is not the current maximum, there
2270 * is nothing special we need to do.
2271 */
2272 if (rbd_id != atomic64_read(&rbd_id_max)) {
2273 spin_unlock(&rbd_dev_list_lock);
2274 return;
2275 }
2276
2277 /*
2278 * We need to update the current maximum id. Search the
2279 * list to find out what it is. We're more likely to find
2280 * the maximum at the end, so search the list backward.
2281 */
2282 max_id = 0;
2283 list_for_each_prev(tmp, &rbd_dev_list) {
2284 struct rbd_device *rbd_dev;
2285
2286 rbd_dev = list_entry(tmp, struct rbd_device, node);
2287 if (rbd_id > max_id)
2288 max_id = rbd_id;
2289 }
Alex Elder499afd52012-02-02 08:13:29 -06002290 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002291
Alex Elder1ddbe942012-01-29 13:57:44 -06002292 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002293 * The max id could have been updated by rbd_id_get(), in
2294 * which case it now accurately reflects the new maximum.
2295 * Be careful not to overwrite the maximum value in that
2296 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002297 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002298 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002299}
2300
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, spaces);	/* skip leading whitespace */
	*buf = start;

	return strcspn(start, spaces);		/* length of the token */
}
2319
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* only copy when the result (plus NUL) fits the caller's buffer */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2349
2350/*
Alex Elderea3352f2012-07-09 21:04:23 -05002351 * Finds the next token in *buf, dynamically allocates a buffer big
2352 * enough to hold a copy of it, and copies the token into the new
2353 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2354 * that a duplicate buffer is created even for a zero-length token.
2355 *
2356 * Returns a pointer to the newly-allocated duplicate, or a null
2357 * pointer if memory for the duplicate was not available. If
2358 * the lenp argument is a non-null pointer, the length of the token
2359 * (not including the '\0') is returned in *lenp.
2360 *
2361 * If successful, the *buf pointer will be updated to point beyond
2362 * the end of the found token.
2363 *
2364 * Note: uses GFP_KERNEL for allocation.
2365 */
2366static inline char *dup_token(const char **buf, size_t *lenp)
2367{
2368 char *dup;
2369 size_t len;
2370
2371 len = next_token(buf);
2372 dup = kmalloc(len + 1, GFP_KERNEL);
2373 if (!dup)
2374 return NULL;
2375
2376 memcpy(dup, *buf, len);
2377 *(dup + len) = '\0';
2378 *buf += len;
2379
2380 if (lenp)
2381 *lenp = len;
2382
2383 return dup;
2384}
2385
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.  The
 * error path below relies on that: it unconditionally kfree()s and
 * re-zeroes every field this function may have set, so a failure at
 * any point leaves rbd_dev exactly as it was on entry.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/*
	 * Monitor addresses are not copied; *mon_addrs points into
	 * the caller's buffer, with *mon_addrs_size covering the
	 * token plus one byte for a terminator the caller may add.
	 */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Options token must be non-empty and fit the caller's buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* All remaining failures below are allocation failures */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* kfree(NULL) is a no-op, so fields not yet set are harmless */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2468
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002469static ssize_t rbd_add(struct bus_type *bus,
2470 const char *buf,
2471 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002472{
Alex Eldercb8627c2012-07-09 21:04:23 -05002473 char *options;
2474 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002475 const char *mon_addrs = NULL;
2476 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002477 struct ceph_osd_client *osdc;
2478 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002479
2480 if (!try_module_get(THIS_MODULE))
2481 return -ENODEV;
2482
Alex Elder27cc2592012-02-02 08:13:30 -06002483 options = kmalloc(count, GFP_KERNEL);
2484 if (!options)
2485 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002486 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2487 if (!rbd_dev)
2488 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002489
2490 /* static rbd_device initialization */
2491 spin_lock_init(&rbd_dev->lock);
2492 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002493 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002494 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002495
Alex Elderd184f6b2012-01-29 13:57:44 -06002496 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002497 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002498
Alex Eldera725f65e2012-02-02 08:13:30 -06002499 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002500 BUILD_BUG_ON(DEV_NAME_LEN
2501 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002502 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002503
Alex Eldera725f65e2012-02-02 08:13:30 -06002504 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002505 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002506 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002507 if (rc)
2508 goto err_put_id;
2509
Alex Elder5214ecc2012-02-02 08:13:30 -06002510 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2511 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002512 if (IS_ERR(rbd_dev->rbd_client)) {
2513 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002514 rbd_dev->rbd_client = NULL;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002515 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002516 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002517
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002518 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002519 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002520 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2521 if (rc < 0)
2522 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002523 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002524
2525 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002526 rc = register_blkdev(0, rbd_dev->name);
2527 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002528 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002529 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002530
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002531 rc = rbd_bus_add_dev(rbd_dev);
2532 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002533 goto err_out_blkdev;
2534
Alex Elder32eec682012-02-08 16:11:14 -06002535 /*
2536 * At this point cleanup in the event of an error is the job
2537 * of the sysfs code (initiated by rbd_bus_del_dev()).
2538 *
2539 * Set up and announce blkdev mapping.
2540 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002541 rc = rbd_init_disk(rbd_dev);
2542 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002543 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002544
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002545 rc = rbd_init_watch_dev(rbd_dev);
2546 if (rc)
2547 goto err_out_bus;
2548
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002549 return count;
2550
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002551err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002552 /* this will also clean up rest of rbd_dev stuff */
2553
2554 rbd_bus_del_dev(rbd_dev);
2555 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002556 return rc;
2557
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002558err_out_blkdev:
2559 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2560err_out_client:
2561 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002562err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002563 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002564 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002565 kfree(rbd_dev->header_name);
2566 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002567 kfree(rbd_dev->pool_name);
2568 }
Alex Elder499afd52012-02-02 08:13:29 -06002569 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002570err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002571 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002572 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002573
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002574 dout("Error adding device %s\n", buf);
2575 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002576
2577 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002578}
2579
Alex Elderde71a292012-07-03 16:01:19 -05002580static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002581{
2582 struct list_head *tmp;
2583 struct rbd_device *rbd_dev;
2584
Alex Eldere124a822012-01-29 13:57:44 -06002585 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002586 list_for_each(tmp, &rbd_dev_list) {
2587 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002588 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002589 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002590 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002591 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002592 }
Alex Eldere124a822012-01-29 13:57:44 -06002593 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002594 return NULL;
2595}
2596
/*
 * Device-model release callback: final teardown of an rbd_device,
 * invoked once the sysfs device is dropped (rbd_bus_del_dev()).
 *
 * Teardown order matters here: the watch machinery is torn down
 * before the ceph client is released, the disk before the blkdev
 * major, and the id/struct last.  The module reference taken in
 * rbd_add() is released at the very end.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2627
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002628static ssize_t rbd_remove(struct bus_type *bus,
2629 const char *buf,
2630 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002631{
2632 struct rbd_device *rbd_dev = NULL;
2633 int target_id, rc;
2634 unsigned long ul;
2635 int ret = count;
2636
2637 rc = strict_strtoul(buf, 10, &ul);
2638 if (rc)
2639 return rc;
2640
2641 /* convert to int; abort if we lost anything in the conversion */
2642 target_id = (int) ul;
2643 if (target_id != ul)
2644 return -EINVAL;
2645
2646 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2647
2648 rbd_dev = __rbd_get_dev(target_id);
2649 if (!rbd_dev) {
2650 ret = -ENOENT;
2651 goto done;
2652 }
2653
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002654 __rbd_remove_all_snaps(rbd_dev);
2655 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002656
2657done:
2658 mutex_unlock(&ctl_mutex);
2659 return ret;
2660}
2661
/*
 * Sysfs attribute handler: create a new snapshot with the name
 * written to the attribute, then refresh the header and notify
 * watchers.  Returns count on success or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * NOTE(review): snprintf with size `count` copies at most
	 * count - 1 bytes, so the last byte of buf is always dropped.
	 * Presumably that's relied on to strip the sysfs trailing
	 * newline, but input without a newline loses a real character
	 * (the buffer is count + 1 bytes, so `count + 1` would copy
	 * all of it) — TODO confirm against callers.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	/* Re-read the on-disk header so the new snap is visible */
	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2702
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002703/*
2704 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002705 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002706 */
2707static int rbd_sysfs_init(void)
2708{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002709 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002710
Alex Elderfed4c142012-02-07 12:03:36 -06002711 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002712 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002713 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002714
Alex Elderfed4c142012-02-07 12:03:36 -06002715 ret = bus_register(&rbd_bus_type);
2716 if (ret < 0)
2717 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002718
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002719 return ret;
2720}
2721
/* Tear down the sysfs entries in reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2727
2728int __init rbd_init(void)
2729{
2730 int rc;
2731
2732 rc = rbd_sysfs_init();
2733 if (rc)
2734 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002735 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002736 return 0;
2737}
2738
/* Module exit: remove the sysfs bus and root device */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2743
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");