blob: c9de0f8e808e3a718862dca1bad066f93a5609f3 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderf0f8cef2012-01-29 13:57:44 -060053#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070055
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
Yehuda Sadeh602adf42010-08-12 16:11:25 -070058#define RBD_MAX_SNAP_NAME_LEN 32
59#define RBD_MAX_OPT_LEN 1024
60
61#define RBD_SNAP_HEAD_NAME "-"
62
Alex Elder81a89792012-02-02 08:13:30 -060063/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060070#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070071
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070072#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
Yehuda Sadeh602adf42010-08-12 16:11:25 -070074/*
75 * block device image metadata (in-memory version)
76 */
77struct rbd_image_header {
78 u64 image_size;
Alex Elder849b4262012-07-09 21:04:24 -050079 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070080 __u8 obj_order;
81 __u8 crypt_type;
82 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083 struct ceph_snap_context *snapc;
Alex Elder0f1d3f92012-08-02 11:29:44 -050084 u64 snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070085 u32 total_snaps;
86
87 char *snap_names;
88 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070089
90 u64 obj_version;
91};
92
/* per-client option values parsed from the "add" string */
struct rbd_options {
	int notify_timeout;
};
96
97/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060098 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070099 */
100struct rbd_client {
101 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700102 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700103 struct kref kref;
104 struct list_head node;
105};
106
107/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600108 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700109 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700110struct rbd_req_status {
111 int done;
112 int rc;
113 u64 bytes;
114};
115
116/*
117 * a collection of requests
118 */
119struct rbd_req_coll {
120 int total;
121 int num_done;
122 struct kref kref;
123 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700124};
125
Alex Elderf0f8cef2012-01-29 13:57:44 -0600126/*
127 * a single io request
128 */
129struct rbd_request {
130 struct request *rq; /* blk layer request */
131 struct bio *bio; /* cloned bio */
132 struct page **pages; /* list of used pages */
133 u64 len;
134 int coll_index;
135 struct rbd_req_coll *coll;
136};
137
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800138struct rbd_snap {
139 struct device dev;
140 const char *name;
Josh Durgin35915382011-12-05 18:25:13 -0800141 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800142 struct list_head node;
143 u64 id;
144};
145
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700146/*
147 * a single device
148 */
149struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500150 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700151
152 int major; /* blkdev assigned major */
153 struct gendisk *disk; /* blkdev's gendisk and rq */
154 struct request_queue *q;
155
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700156 struct rbd_client *rbd_client;
157
158 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
159
160 spinlock_t lock; /* queue lock */
161
162 struct rbd_image_header header;
Alex Elder0bed54d2012-07-03 16:01:18 -0500163 char *image_name;
164 size_t image_name_len;
165 char *header_name;
Alex Elderd22f76e2012-07-12 10:46:35 -0500166 char *pool_name;
Alex Elder9bb2f332012-07-12 10:46:35 -0500167 int pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700168
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700169 struct ceph_osd_event *watch_event;
170 struct ceph_osd_request *watch_request;
171
Josh Durginc6666012011-11-21 17:11:12 -0800172 /* protects updating the header */
173 struct rw_semaphore header_rwsem;
Josh Durgine88a36e2011-11-21 18:14:25 -0800174 /* name of the snapshot this device reads from */
Alex Elder820a5f32012-07-09 21:04:24 -0500175 char *snap_name;
Josh Durgine88a36e2011-11-21 18:14:25 -0800176 /* id of the snapshot this device reads from */
Josh Durgin77dfe992011-11-21 13:04:42 -0800177 u64 snap_id; /* current snapshot id */
Josh Durgine88a36e2011-11-21 18:14:25 -0800178 /* whether the snap_id this device reads from still exists */
179 bool snap_exists;
180 int read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700181
182 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800183
184 /* list of snapshots */
185 struct list_head snaps;
186
187 /* sysfs related */
188 struct device dev;
189};
190
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700191static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a822012-01-29 13:57:44 -0600192
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700193static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a822012-01-29 13:57:44 -0600194static DEFINE_SPINLOCK(rbd_dev_list_lock);
195
Alex Elder432b8582012-01-29 13:57:44 -0600196static LIST_HEAD(rbd_client_list); /* clients */
197static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800199static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
200static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800201static ssize_t rbd_snap_add(struct device *dev,
202 struct device_attribute *attr,
203 const char *buf,
204 size_t count);
Alex Elder14e70852012-07-19 09:09:27 -0500205static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800206
Alex Elderf0f8cef2012-01-29 13:57:44 -0600207static ssize_t rbd_add(struct bus_type *bus, const char *buf,
208 size_t count);
209static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
210 size_t count);
211
212static struct bus_attribute rbd_bus_attrs[] = {
213 __ATTR(add, S_IWUSR, NULL, rbd_add),
214 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
215 __ATTR_NULL
216};
217
218static struct bus_type rbd_bus_type = {
219 .name = "rbd",
220 .bus_attrs = rbd_bus_attrs,
221};
222
/* No-op release; rbd_root_dev is statically allocated, never freed. */
static void rbd_root_dev_release(struct device *dev)
{
}
226
227static struct device rbd_root_dev = {
228 .init_name = "rbd",
229 .release = rbd_root_dev_release,
230};
231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
234{
235 return get_device(&rbd_dev->dev);
236}
237
238static void rbd_put_dev(struct rbd_device *rbd_dev)
239{
240 put_device(&rbd_dev->dev);
241}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700242
Alex Elder1fe5e992012-07-25 09:32:41 -0500243static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700244
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700245static int rbd_open(struct block_device *bdev, fmode_t mode)
246{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600247 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700248
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
250 return -EROFS;
251
Alex Elder340c7a22012-08-10 13:12:07 -0700252 rbd_get_dev(rbd_dev);
253 set_device_ro(bdev, rbd_dev->read_only);
254
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700255 return 0;
256}
257
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800258static int rbd_release(struct gendisk *disk, fmode_t mode)
259{
260 struct rbd_device *rbd_dev = disk->private_data;
261
262 rbd_put_dev(rbd_dev);
263
264 return 0;
265}
266
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700267static const struct block_device_operations rbd_bd_ops = {
268 .owner = THIS_MODULE,
269 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800270 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700271};
272
273/*
274 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500275 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276 */
Alex Elder43ae4702012-07-03 16:01:18 -0500277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700278 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d862012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Alex Elder43ae4702012-07-03 16:01:18 -0500293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600295 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700302 rbdc->rbd_opts = rbd_opts;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d862012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
324/*
325 * Find a ceph client with specific addr and configuration.
326 */
Alex Elder43ae4702012-07-03 16:01:18 -0500327static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700328{
329 struct rbd_client *client_node;
330
Alex Elder43ae4702012-07-03 16:01:18 -0500331 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332 return NULL;
333
334 list_for_each_entry(client_node, &rbd_client_list, node)
Alex Elder43ae4702012-07-03 16:01:18 -0500335 if (!ceph_compare_options(ceph_opts, client_node->client))
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336 return client_node;
337 return NULL;
338}
339
/*
 * mount options
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};
350
Alex Elder43ae4702012-07-03 16:01:18 -0500351static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700352 {Opt_notify_timeout, "notify_timeout=%d"},
353 /* int args above */
354 /* string args above */
355 {-1, NULL}
356};
357
358static int parse_rbd_opts_token(char *c, void *private)
359{
Alex Elder43ae4702012-07-03 16:01:18 -0500360 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700361 substring_t argstr[MAX_OPT_ARGS];
362 int token, intval, ret;
363
Alex Elder43ae4702012-07-03 16:01:18 -0500364 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700365 if (token < 0)
366 return -EINVAL;
367
368 if (token < Opt_last_int) {
369 ret = match_int(&argstr[0], &intval);
370 if (ret < 0) {
371 pr_err("bad mount option arg (not int) "
372 "at '%s'\n", c);
373 return ret;
374 }
375 dout("got int token %d val %d\n", token, intval);
376 } else if (token > Opt_last_int && token < Opt_last_string) {
377 dout("got string token %d val %s\n", token,
378 argstr[0].from);
379 } else {
380 dout("got token %d\n", token);
381 }
382
383 switch (token) {
384 case Opt_notify_timeout:
Alex Elder43ae4702012-07-03 16:01:18 -0500385 rbd_opts->notify_timeout = intval;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700386 break;
387 default:
388 BUG_ON(token);
389 }
390 return 0;
391}
392
393/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700394 * Get a ceph client with specific addr and configuration, if one does
395 * not exist create it.
396 */
Alex Elder5214ecc2012-02-02 08:13:30 -0600397static struct rbd_client *rbd_get_client(const char *mon_addr,
398 size_t mon_addr_len,
399 char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700400{
401 struct rbd_client *rbdc;
Alex Elder43ae4702012-07-03 16:01:18 -0500402 struct ceph_options *ceph_opts;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700403 struct rbd_options *rbd_opts;
404
405 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
406 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600407 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700408
409 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700410
Alex Elder43ae4702012-07-03 16:01:18 -0500411 ceph_opts = ceph_parse_options(options, mon_addr,
412 mon_addr + mon_addr_len,
413 parse_rbd_opts_token, rbd_opts);
414 if (IS_ERR(ceph_opts)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600415 kfree(rbd_opts);
Alex Elder43ae4702012-07-03 16:01:18 -0500416 return ERR_CAST(ceph_opts);
Alex Elderee577412012-01-24 10:08:36 -0600417 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700418
Alex Elder432b8582012-01-29 13:57:44 -0600419 spin_lock(&rbd_client_list_lock);
Alex Elder43ae4702012-07-03 16:01:18 -0500420 rbdc = __rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421 if (rbdc) {
Alex Eldere6994d32012-01-29 13:57:44 -0600422 /* using an existing client */
423 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600424 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d32012-01-29 13:57:44 -0600425
Alex Elder43ae4702012-07-03 16:01:18 -0500426 ceph_destroy_options(ceph_opts);
Alex Elder97bb59a2012-01-24 10:08:36 -0600427 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700428
Alex Elderd720bcb2012-02-02 08:13:30 -0600429 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700430 }
Alex Elder432b8582012-01-29 13:57:44 -0600431 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432
Alex Elder43ae4702012-07-03 16:01:18 -0500433 rbdc = rbd_client_create(ceph_opts, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600434
Alex Elderd720bcb2012-02-02 08:13:30 -0600435 if (IS_ERR(rbdc))
436 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437
Alex Elderd720bcb2012-02-02 08:13:30 -0600438 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700439}
440
441/*
442 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600443 *
Alex Elder432b8582012-01-29 13:57:44 -0600444 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700445 */
446static void rbd_client_release(struct kref *kref)
447{
448 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
449
450 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500451 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700452 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500453 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700454
455 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700456 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457 kfree(rbdc);
458}
459
460/*
461 * Drop reference to ceph client node. If it's not referenced anymore, release
462 * it.
463 */
464static void rbd_put_client(struct rbd_device *rbd_dev)
465{
466 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
467 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468}
469
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700470/*
471 * Destroy requests collection
472 */
473static void rbd_coll_release(struct kref *kref)
474{
475 struct rbd_req_coll *coll =
476 container_of(kref, struct rbd_req_coll, kref);
477
478 dout("rbd_coll_release %p\n", coll);
479 kfree(coll);
480}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700481
Alex Elder8e94af82012-07-25 09:32:40 -0500482static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
483{
484 return !memcmp(&ondisk->text,
485 RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
486}
487
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700488/*
489 * Create a new header structure, translate header format from the on-disk
490 * header.
491 */
492static int rbd_header_from_disk(struct rbd_image_header *header,
493 struct rbd_image_header_ondisk *ondisk,
Alex Eldered63f4f2012-07-19 09:09:27 -0500494 u32 allocated_snaps)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700495{
Alex Elderccece232012-07-10 20:30:10 -0500496 u32 snap_count;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500497 size_t size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700498
Alex Elder8e94af82012-07-25 09:32:40 -0500499 if (!rbd_dev_ondisk_valid(ondisk))
Josh Durgin81e759f2011-11-15 14:49:53 -0800500 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800501
Alex Elder00f1f362012-02-07 12:03:36 -0600502 snap_count = le32_to_cpu(ondisk->snap_count);
Alex Elderd2bb24e2012-07-26 23:37:14 -0500503
504 /* Make sure we don't overflow below */
505 size = SIZE_MAX - sizeof (struct ceph_snap_context);
506 if (snap_count > size / sizeof (header->snapc->snaps[0]))
Xi Wang50f7c4c2012-04-20 15:49:44 -0500507 return -EINVAL;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500508
Alex Elder6a523252012-07-19 17:12:59 -0500509 memset(header, 0, sizeof (*header));
510
511 size = sizeof (ondisk->block_name) + 1;
512 header->object_prefix = kmalloc(size, GFP_KERNEL);
513 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514 return -ENOMEM;
Alex Elder6a523252012-07-19 17:12:59 -0500515 memcpy(header->object_prefix, ondisk->block_name, size - 1);
516 header->object_prefix[size - 1] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600517
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700518 if (snap_count) {
Alex Elderccece232012-07-10 20:30:10 -0500519 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
Alex Elder0f1d3f92012-08-02 11:29:44 -0500520 BUG_ON(header->snap_names_len > (u64) SIZE_MAX);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700521 header->snap_names = kmalloc(header->snap_names_len,
Alex Eldered63f4f2012-07-19 09:09:27 -0500522 GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700523 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500524 goto out_err;
525
Alex Elderd2bb24e2012-07-26 23:37:14 -0500526 size = snap_count * sizeof (*header->snap_sizes);
527 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700528 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500529 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700530 } else {
Alex Elderccece232012-07-10 20:30:10 -0500531 WARN_ON(ondisk->snap_names_len);
532 header->snap_names_len = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700533 header->snap_names = NULL;
534 header->snap_sizes = NULL;
535 }
Alex Elder849b4262012-07-09 21:04:24 -0500536
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700537 header->image_size = le64_to_cpu(ondisk->image_size);
538 header->obj_order = ondisk->options.order;
539 header->crypt_type = ondisk->options.crypt_type;
540 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500541 header->total_snaps = snap_count;
542
543 /* Set up the snapshot context */
544
545 size = sizeof (struct ceph_snap_context);
546 size += snap_count * sizeof (header->snapc->snaps[0]);
547 header->snapc = kzalloc(size, GFP_KERNEL);
548 if (!header->snapc)
549 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700550
551 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500552 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700553 header->snapc->num_snaps = snap_count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700554
Alex Elder21079782012-01-24 10:08:36 -0600555 if (snap_count && allocated_snaps == snap_count) {
Alex Elderccece232012-07-10 20:30:10 -0500556 int i;
557
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700558 for (i = 0; i < snap_count; i++) {
559 header->snapc->snaps[i] =
560 le64_to_cpu(ondisk->snaps[i].id);
561 header->snap_sizes[i] =
562 le64_to_cpu(ondisk->snaps[i].image_size);
563 }
564
565 /* copy snapshot names */
Alex Elderccece232012-07-10 20:30:10 -0500566 memcpy(header->snap_names, &ondisk->snaps[snap_count],
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700567 header->snap_names_len);
568 }
569
570 return 0;
571
Alex Elder6a523252012-07-19 17:12:59 -0500572out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500573 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500574 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500576 header->snap_names = NULL;
Alex Elderd78fd7a2012-07-26 23:37:14 -0500577 header->snap_names_len = 0;
Alex Elder6a523252012-07-19 17:12:59 -0500578 kfree(header->object_prefix);
579 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500580
Alex Elder00f1f362012-02-07 12:03:36 -0600581 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582}
583
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700584static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
585 u64 *seq, u64 *size)
586{
587 int i;
588 char *p = header->snap_names;
589
Alex Elder00f1f362012-02-07 12:03:36 -0600590 for (i = 0; i < header->total_snaps; i++) {
591 if (!strcmp(snap_name, p)) {
592
593 /* Found it. Pass back its id and/or size */
594
595 if (seq)
596 *seq = header->snapc->snaps[i];
597 if (size)
598 *size = header->snap_sizes[i];
599 return i;
600 }
601 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700602 }
Alex Elder00f1f362012-02-07 12:03:36 -0600603 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700604}
605
Alex Elder0ce1a792012-07-03 16:01:18 -0500606static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700607{
Alex Elder78dc4472012-07-19 08:49:18 -0500608 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609
Alex Elder0ce1a792012-07-03 16:01:18 -0500610 down_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700611
Alex Elder0ce1a792012-07-03 16:01:18 -0500612 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800613 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0ce1a792012-07-03 16:01:18 -0500614 rbd_dev->snap_id = CEPH_NOSNAP;
Josh Durgine88a36e2011-11-21 18:14:25 -0800615 rbd_dev->snap_exists = false;
Alex Elder0ce1a792012-07-03 16:01:18 -0500616 rbd_dev->read_only = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617 if (size)
Alex Elder78dc4472012-07-19 08:49:18 -0500618 *size = rbd_dev->header.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 } else {
Alex Elder78dc4472012-07-19 08:49:18 -0500620 u64 snap_id = 0;
621
622 ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
623 &snap_id, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700624 if (ret < 0)
625 goto done;
Alex Elder78dc4472012-07-19 08:49:18 -0500626 rbd_dev->snap_id = snap_id;
Josh Durgine88a36e2011-11-21 18:14:25 -0800627 rbd_dev->snap_exists = true;
Alex Elder0ce1a792012-07-03 16:01:18 -0500628 rbd_dev->read_only = 1;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 }
630
631 ret = 0;
632done:
Alex Elder0ce1a792012-07-03 16:01:18 -0500633 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700634 return ret;
635}
636
637static void rbd_header_free(struct rbd_image_header *header)
638{
Alex Elder849b4262012-07-09 21:04:24 -0500639 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500640 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700641 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500642 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500643 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500644 header->snap_names = NULL;
645 header->snap_names_len = 0;
Josh Durgind1d25642011-12-05 14:03:05 -0800646 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500647 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648}
649
650/*
651 * get the actual striped segment name, offset and length
652 */
653static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500654 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700655 u64 ofs, u64 len,
656 char *seg_name, u64 *segofs)
657{
658 u64 seg = ofs >> header->obj_order;
659
660 if (seg_name)
661 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500662 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663
664 ofs = ofs & ((1 << header->obj_order) - 1);
665 len = min_t(u64, len, (1 << header->obj_order) - ofs);
666
667 if (segofs)
668 *segofs = ofs;
669
670 return len;
671}
672
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700673static int rbd_get_num_segments(struct rbd_image_header *header,
674 u64 ofs, u64 len)
675{
676 u64 start_seg = ofs >> header->obj_order;
677 u64 end_seg = (ofs + len - 1) >> header->obj_order;
678 return end_seg - start_seg + 1;
679}
680
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700681/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700682 * returns the size of an object in the image
683 */
684static u64 rbd_obj_bytes(struct rbd_image_header *header)
685{
686 return 1 << header->obj_order;
687}
688
689/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690 * bio helpers
691 */
692
693static void bio_chain_put(struct bio *chain)
694{
695 struct bio *tmp;
696
697 while (chain) {
698 tmp = chain;
699 chain = chain->bi_next;
700 bio_put(tmp);
701 }
702}
703
704/*
705 * zeros a bio chain, starting at specific offset
706 */
707static void zero_bio_chain(struct bio *chain, int start_ofs)
708{
709 struct bio_vec *bv;
710 unsigned long flags;
711 void *buf;
712 int i;
713 int pos = 0;
714
715 while (chain) {
716 bio_for_each_segment(bv, chain, i) {
717 if (pos + bv->bv_len > start_ofs) {
718 int remainder = max(start_ofs - pos, 0);
719 buf = bvec_kmap_irq(bv, &flags);
720 memset(buf + remainder, 0,
721 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200722 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723 }
724 pos += bv->bv_len;
725 }
726
727 chain = chain->bi_next;
728 }
729}
730
731/*
732 * bio_chain_clone - clone a chain of bios up to a certain length.
733 * might return a bio_pair that will need to be released.
734 */
735static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
736 struct bio_pair **bp,
737 int len, gfp_t gfpmask)
738{
739 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
740 int total = 0;
741
742 if (*bp) {
743 bio_pair_release(*bp);
744 *bp = NULL;
745 }
746
747 while (old_chain && (total < len)) {
748 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
749 if (!tmp)
750 goto err_out;
751
752 if (total + old_chain->bi_size > len) {
753 struct bio_pair *bp;
754
755 /*
756 * this split can only happen with a single paged bio,
757 * split_bio will BUG_ON if this is not the case
758 */
759 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500760 "bi_size=%u\n",
761 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700762
763 /* split the bio. We'll release it either in the next
764 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600765 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700766 if (!bp)
767 goto err_out;
768
769 __bio_clone(tmp, &bp->bio1);
770
771 *next = &bp->bio2;
772 } else {
773 __bio_clone(tmp, old_chain);
774 *next = old_chain->bi_next;
775 }
776
777 tmp->bi_bdev = NULL;
778 gfpmask &= ~__GFP_WAIT;
779 tmp->bi_next = NULL;
780
781 if (!new_chain) {
782 new_chain = tail = tmp;
783 } else {
784 tail->bi_next = tmp;
785 tail = tmp;
786 }
787 old_chain = old_chain->bi_next;
788
789 total += tmp->bi_size;
790 }
791
792 BUG_ON(total < len);
793
794 if (tail)
795 tail->bi_next = NULL;
796
797 *old = old_chain;
798
799 return new_chain;
800
801err_out:
802 dout("bio_chain_clone with err\n");
803 bio_chain_put(new_chain);
804 return NULL;
805}
806
807/*
808 * helpers for osd request op vectors.
809 */
Alex Elder57cfc102012-06-26 12:57:03 -0700810static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
811 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700812{
Alex Elder57cfc102012-06-26 12:57:03 -0700813 struct ceph_osd_req_op *ops;
814
815 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
816 if (!ops)
817 return NULL;
818
819 ops[0].op = opcode;
820
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700821 /*
822 * op extent offset and length will be set later on
823 * in calc_raw_layout()
824 */
Alex Elder57cfc102012-06-26 12:57:03 -0700825 ops[0].payload_len = payload_len;
826
827 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700828}
829
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
834
/*
 * Record the completion of sub-request @index of collection @coll and
 * end, in order, the contiguous run of finished sub-requests starting
 * at coll->num_done.  Out-of-order completions are held back until all
 * of their predecessors have completed, so the block request @rq is
 * always ended front-to-back.  One collection reference is dropped per
 * sub-request retired.  Without a collection, the request is ended in
 * a single call.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes all updates to the status array */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* advance max over the run of consecutive completed entries */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
872
/* End @req's sub-request using the collection/index stored inside it. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
878
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700879/*
880 * Send ceph osd request
881 */
882static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500883 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884 struct ceph_snap_context *snapc,
885 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500886 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887 struct bio *bio,
888 struct page **pages,
889 int num_pages,
890 int flags,
891 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700892 struct rbd_req_coll *coll,
893 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700894 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700895 struct ceph_msg *msg),
896 struct ceph_osd_request **linger_req,
897 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700898{
899 struct ceph_osd_request *req;
900 struct ceph_file_layout *layout;
901 int ret;
902 u64 bno;
903 struct timespec mtime = CURRENT_TIME;
904 struct rbd_request *req_data;
905 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600906 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700907
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700908 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700909 if (!req_data) {
910 if (coll)
911 rbd_coll_end_req_index(rq, coll, coll_index,
912 -ENOMEM, len);
913 return -ENOMEM;
914 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700915
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700916 if (coll) {
917 req_data->coll = coll;
918 req_data->coll_index = coll_index;
919 }
920
Alex Elderbd919d42012-07-13 20:35:11 -0500921 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
922 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700923
Alex Elder0ce1a792012-07-03 16:01:18 -0500924 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600925 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
926 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700927 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700928 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700929 goto done_pages;
930 }
931
932 req->r_callback = rbd_cb;
933
934 req_data->rq = rq;
935 req_data->bio = bio;
936 req_data->pages = pages;
937 req_data->len = len;
938
939 req->r_priv = req_data;
940
941 reqhead = req->r_request->front.iov_base;
942 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
943
Alex Elderaded07e2012-07-03 16:01:18 -0500944 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700945 req->r_oid_len = strlen(req->r_oid);
946
947 layout = &req->r_file_layout;
948 memset(layout, 0, sizeof(*layout));
949 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
950 layout->fl_stripe_count = cpu_to_le32(1);
951 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500952 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600953 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
954 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700955
956 ceph_osdc_build_request(req, ofs, &len,
957 ops,
958 snapc,
959 &mtime,
960 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700961
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700962 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600963 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700964 *linger_req = req;
965 }
966
Alex Elder1dbb4392012-01-24 10:08:37 -0600967 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 if (ret < 0)
969 goto done_err;
970
971 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600972 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700973 if (ver)
974 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -0500975 dout("reassert_ver=%llu\n",
976 (unsigned long long)
977 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700978 ceph_osdc_put_request(req);
979 }
980 return ret;
981
982done_err:
983 bio_chain_put(req_data->bio);
984 ceph_osdc_put_request(req);
985done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700986 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700987 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700988 return ret;
989}
990
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous osd requests: parses the reply
 * header, patches up reads (see below), completes the collection
 * entry, and releases the request and its bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	/*
	 * A read of a nonexistent object is treated as a successful
	 * read of zeros; a short read is zero-padded out to the
	 * requested length.
	 */
	if (rc == -ENOENT && read_op) {
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1030
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1035
/*
 * Do a synchronous ceph osd operation
 *
 * Backs the transfer with a freshly allocated page vector, issues the
 * request with no callback (rbd_do_request() then waits for the
 * reply), and for reads copies the returned bytes into @buf.  The
 * page vector is released on all paths.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	BUG_ON(ops == NULL);

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,		/* no callback: synchronous */
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* for reads, the (non-negative) return value is the copy length */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1079
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image byte extent [@ofs, @ofs + @len) onto its backing
 * object segment, builds a one-op vector, and submits the request.
 * The bios were already split on segment boundaries upstream, so the
 * extent must lie within a single object here (BUG_ON below).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate image offset/length into object name + offset */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1135
1136/*
1137 * Request async osd write
1138 */
1139static int rbd_req_write(struct request *rq,
1140 struct rbd_device *rbd_dev,
1141 struct ceph_snap_context *snapc,
1142 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001143 struct bio *bio,
1144 struct rbd_req_coll *coll,
1145 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001146{
1147 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1148 CEPH_OSD_OP_WRITE,
1149 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001150 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001151}
1152
1153/*
1154 * Request async osd read
1155 */
1156static int rbd_req_read(struct request *rq,
1157 struct rbd_device *rbd_dev,
1158 u64 snapid,
1159 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001160 struct bio *bio,
1161 struct rbd_req_coll *coll,
1162 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001163{
1164 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001165 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001166 CEPH_OSD_OP_READ,
1167 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001168 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001169}
1170
1171/*
1172 * Request sync osd read
1173 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001174static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001175 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001176 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001177 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001178 char *buf,
1179 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180{
Alex Elder913d2fd2012-06-26 12:57:03 -07001181 struct ceph_osd_req_op *ops;
1182 int ret;
1183
1184 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1185 if (!ops)
1186 return -ENOMEM;
1187
1188 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001189 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001190 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001191 ops, object_name, ofs, len, buf, NULL, ver);
1192 rbd_destroy_ops(ops);
1193
1194 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001195}
1196
/*
 * Acknowledge a notification received on the watched header object.
 * (Fired asynchronously; completion is rbd_simple_req_cb.)
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): unlike watch.ver above, the cookie is not run
	 * through cpu_to_le64() — confirm notify_id's byte order as
	 * delivered to rbd_watch_cb().
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1226
/*
 * Watch callback: the header object changed, so refresh the in-memory
 * header, then acknowledge the notification (regardless of whether
 * the refresh succeeded).
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* acked even on refresh failure */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1246
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object so rbd_watch_cb() fires when
 * it changes.  On success the lingering osd request is stored in
 * rbd_dev->watch_request and the event in rbd_dev->watch_event; on
 * failure both are unwound.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* register (contrast rbd_req_sync_unwatch) */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1290
/*
 * Request sync osd unwatch
 *
 * Tears down the watch registered by rbd_req_sync_watch() and
 * releases the associated event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == unregister */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	/* the event is released even if the unwatch op failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1320
/* Context handed to rbd_notify_cb() via ceph_osdc_create_event(). */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1324
1325static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1326{
Alex Elder0ce1a792012-07-03 16:01:18 -05001327 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1328 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001329 return;
1330
Alex Elderbd919d42012-07-13 20:35:11 -05001331 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1332 rbd_dev->header_name, (unsigned long long) notify_id,
1333 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001334}
1335
/*
 * Request sync osd notify
 *
 * Sends a notify on the header object and waits (with a timeout) for
 * it to complete.  rbd_notify_cb() receives the local delivery.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* two 32-bit payload fields — TODO confirm their layout/meaning */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;	/* presumably seconds — TODO confirm */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       rbd_dev->header_name,
			       0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/*
	 * NOTE(review): the event is not canceled on this success path
	 * and the wait result is discarded (0 is returned regardless) —
	 * verify ceph_osdc_wait_event() semantics for a possible event
	 * leak / masked timeout.
	 */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1385
/*
 * Request sync osd exec
 *
 * Synchronously invoke an osd class method (class_name.method_name)
 * on @object_name, with @data (of length @len) as the method's input.
 * (The previous comment here, "Request sync osd read", was wrong.)
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int ret;

	/* payload carries both names plus the input data */
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
				class_name_len + method_name_len + len);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;	/* truncates past 255 */
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1426
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001427static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1428{
1429 struct rbd_req_coll *coll =
1430 kzalloc(sizeof(struct rbd_req_coll) +
1431 sizeof(struct rbd_req_status) * num_reqs,
1432 GFP_ATOMIC);
1433
1434 if (!coll)
1435 return NULL;
1436 coll->total = num_reqs;
1437 kref_init(&coll->kref);
1438 return coll;
1439}
1440
/*
 * block device queue callback
 *
 * Pulls requests off the queue, splits each on rbd object boundaries,
 * and issues one asynchronous osd read or write per segment.  Per-
 * request completion is coordinated through an rbd_req_coll so the
 * block layer sees sub-requests end in order.  Entered (and left)
 * with q->queue_lock held; the lock is dropped around osd submission.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock while talking to the osd */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted underneath us */
		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snapshot context this request is issued under */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
			do_write ? "write" : "read",
			size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one coll ref per sub-request, dropped on completion */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the initial reference taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		/* reacquire: the loop condition and our caller expect it held */
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1560
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* sectors per rbd object */
	sector_t sector;
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes left in the object past the bio's current end */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* an empty bio may always take one bvec, even across a boundary */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1587
1588static void rbd_free_disk(struct rbd_device *rbd_dev)
1589{
1590 struct gendisk *disk = rbd_dev->disk;
1591
1592 if (!disk)
1593 return;
1594
1595 rbd_header_free(&rbd_dev->header);
1596
1597 if (disk->flags & GENHD_FL_UP)
1598 del_gendisk(disk);
1599 if (disk->queue)
1600 blk_cleanup_queue(disk->queue);
1601 put_disk(disk);
1602}
1603
/*
 * reload the ondisk the header
 *
 * Reads the on-disk image header into @header.  Since the snapshot
 * count can change between reads, the read is retried — with the
 * buffer resized for the last-seen count — until the count in the
 * reply matches the count the buffer was sized for.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		if (snap_count == header->total_snaps)
			break;

		/* snapshot count changed underneath us; resize and retry */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1661
1662/*
1663 * create a snapshot
1664 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001665static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001666 const char *snap_name,
1667 gfp_t gfp_flags)
1668{
1669 int name_len = strlen(snap_name);
1670 u64 new_snapid;
1671 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001672 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001673 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001674
1675 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001676 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001677 return -EINVAL;
1678
Alex Elder0ce1a792012-07-03 16:01:18 -05001679 monc = &rbd_dev->rbd_client->client->monc;
1680 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001681 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001682 if (ret < 0)
1683 return ret;
1684
1685 data = kmalloc(name_len + 16, gfp_flags);
1686 if (!data)
1687 return -ENOMEM;
1688
Sage Weil916d4d62011-05-12 16:10:50 -07001689 p = data;
1690 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001691
Sage Weil916d4d62011-05-12 16:10:50 -07001692 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1693 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001694
Alex Elder0bed54d2012-07-03 16:01:18 -05001695 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001696 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001697 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001698
Sage Weil916d4d62011-05-12 16:10:50 -07001699 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001700
Alex Elder505cbb92012-07-19 08:49:18 -05001701 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001702bad:
1703 return -ERANGE;
1704}
1705
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001706static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1707{
1708 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001709 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001710
Alex Eldera0593292012-07-19 09:09:27 -05001711 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001712 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001713}
1714
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the header and swaps the fresh contents into
 * rbd_dev->header under the header rwsem.  If hver is non-NULL the
 * new header object version is also returned through it.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	/* Take ownership of the arrays freshly allocated by rbd_read_header() */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new snap context */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1762
Alex Elder1fe5e992012-07-25 09:32:41 -05001763static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1764{
1765 int ret;
1766
1767 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1768 ret = __rbd_refresh_header(rbd_dev, hver);
1769 mutex_unlock(&ctl_mutex);
1770
1771 return ret;
1772}
1773
/*
 * Set up the gendisk and request queue for a newly mapped rbd device
 * and announce it to the block layer.  Reads the image header first
 * to learn the mapped size.  Returns 0 on success, negative errno on
 * failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* Resolve the mapped snapshot (or head) and its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Prevent bios from spanning backing objects */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1846
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001847/*
1848 sysfs
1849*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001850
/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1855
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001856static ssize_t rbd_size_show(struct device *dev,
1857 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001858{
Alex Elder593a9e72012-02-07 12:03:37 -06001859 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001860 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001861
Josh Durgina51aa0c2011-12-05 10:35:04 -08001862 down_read(&rbd_dev->header_rwsem);
1863 size = get_capacity(rbd_dev->disk);
1864 up_read(&rbd_dev->header_rwsem);
1865
1866 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001867}
1868
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001869static ssize_t rbd_major_show(struct device *dev,
1870 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001871{
Alex Elder593a9e72012-02-07 12:03:37 -06001872 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001873
1874 return sprintf(buf, "%d\n", rbd_dev->major);
1875}
1876
1877static ssize_t rbd_client_id_show(struct device *dev,
1878 struct device_attribute *attr, char *buf)
1879{
Alex Elder593a9e72012-02-07 12:03:37 -06001880 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001881
Alex Elder1dbb4392012-01-24 10:08:37 -06001882 return sprintf(buf, "client%lld\n",
1883 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001884}
1885
1886static ssize_t rbd_pool_show(struct device *dev,
1887 struct device_attribute *attr, char *buf)
1888{
Alex Elder593a9e72012-02-07 12:03:37 -06001889 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890
1891 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1892}
1893
Alex Elder9bb2f332012-07-12 10:46:35 -05001894static ssize_t rbd_pool_id_show(struct device *dev,
1895 struct device_attribute *attr, char *buf)
1896{
1897 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1898
1899 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1900}
1901
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001902static ssize_t rbd_name_show(struct device *dev,
1903 struct device_attribute *attr, char *buf)
1904{
Alex Elder593a9e72012-02-07 12:03:37 -06001905 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001906
Alex Elder0bed54d2012-07-03 16:01:18 -05001907 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001908}
1909
1910static ssize_t rbd_snap_show(struct device *dev,
1911 struct device_attribute *attr,
1912 char *buf)
1913{
Alex Elder593a9e72012-02-07 12:03:37 -06001914 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001915
1916 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1917}
1918
1919static ssize_t rbd_image_refresh(struct device *dev,
1920 struct device_attribute *attr,
1921 const char *buf,
1922 size_t size)
1923{
Alex Elder593a9e72012-02-07 12:03:37 -06001924 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001925 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001926
Alex Elder1fe5e992012-07-25 09:32:41 -05001927 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001928
1929 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001930}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001931
/* Per-device sysfs attributes (exposed under /sys/bus/rbd/devices/<id>/) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Intentionally empty: the rbd_device's lifetime is managed
 * elsewhere (see rbd_dev_release set in rbd_bus_add_dev).
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1973
1974
1975/*
1976 sysfs - snapshots
1977*/
1978
1979static ssize_t rbd_snap_size_show(struct device *dev,
1980 struct device_attribute *attr,
1981 char *buf)
1982{
1983 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1984
Josh Durgin35915382011-12-05 18:25:13 -08001985 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001986}
1987
1988static ssize_t rbd_snap_id_show(struct device *dev,
1989 struct device_attribute *attr,
1990 char *buf)
1991{
1992 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1993
Josh Durgin35915382011-12-05 18:25:13 -08001994 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001995}
1996
/* Per-snapshot sysfs attributes (under each device's snap_<name>/) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final put on a snapshot device frees the rbd_snap and its name */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2026
/*
 * Unlink a snapshot from the device's list and unregister its
 * sysfs device.  The final reference drop invokes
 * rbd_snap_dev_release(), which frees the rbd_snap itself.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2032
Alex Elder14e70852012-07-19 09:09:27 -05002033static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002034 struct device *parent)
2035{
2036 struct device *dev = &snap->dev;
2037 int ret;
2038
2039 dev->type = &rbd_snap_device_type;
2040 dev->parent = parent;
2041 dev->release = rbd_snap_dev_release;
2042 dev_set_name(dev, "snap_%s", snap->name);
2043 ret = device_register(dev);
2044
2045 return ret;
2046}
2047
/*
 * Allocate an rbd_snap for the i'th entry of the device's snapshot
 * context, named "name", and (if the parent device is already in
 * sysfs) register its snapshot device.  Returns the new rbd_snap or
 * an ERR_PTR on failure.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
		/*
		 * NOTE(review): after a *failed* device_register() the
		 * driver core normally expects put_device() rather than
		 * kfree(); the direct kfree below looks suspect — verify
		 * against the device_register() documentation.
		 */
	}

	return snap;

err:
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2079
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	/* snap_names is a packed sequence of NUL-terminated strings,
	 * one per context entry, in the same order as snapc->snaps */
	char *snap_name = rbd_dev->header.snap_names;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/* Two-cursor merge over the context array and the device list */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;

		/* CEPH_NOSNAP acts as an "infinite" sentinel when a
		 * cursor runs off the end of its sequence */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		BUG_ON(snap && snap->id == CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* If it's the mapped snapshot, it's now stale */
			if (rbd_dev->snap_id == snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, index,
						      snap_name);
			if (IS_ERR(new_snap))
				return PTR_ERR(new_snap);

			/* New goes before existing, or at end of list */

			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add(&new_snap->node, head);
		} else {
			/* Already have this one */

			BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
			BUG_ON(strcmp(snap->name, snap_name));

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
		snap_name += strlen(snap_name) + 1;
	}

	return 0;
}
2161
/*
 * Register the rbd device (and all of its already-known snapshots)
 * on the rbd bus, under the control mutex.  Returns 0 on success
 * or the first registration error encountered.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* Expose each snapshot as a child device of the rbd device */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2189
/* Remove the rbd device from sysfs; children are torn down elsewhere */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2194
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002195static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2196{
2197 int ret, rc;
2198
2199 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002200 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002201 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002202 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002203 if (rc < 0)
2204 return rc;
2205 }
2206 } while (ret == -ERANGE);
2207
2208 return ret;
2209}
2210
/* Highest device id ever handed out; ids themselves start at 1 */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002225
Alex Elder1ddbe942012-01-29 13:57:44 -06002226/*
Alex Elder499afd52012-02-02 08:13:29 -06002227 * Remove an rbd_dev from the global list, and record that its
2228 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002229 */
Alex Elder499afd52012-02-02 08:13:29 -06002230static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002231{
Alex Elderd184f6b2012-01-29 13:57:44 -06002232 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002233 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002234 int max_id;
2235
2236 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002237
2238 spin_lock(&rbd_dev_list_lock);
2239 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002240
2241 /*
2242 * If the id being "put" is not the current maximum, there
2243 * is nothing special we need to do.
2244 */
2245 if (rbd_id != atomic64_read(&rbd_id_max)) {
2246 spin_unlock(&rbd_dev_list_lock);
2247 return;
2248 }
2249
2250 /*
2251 * We need to update the current maximum id. Search the
2252 * list to find out what it is. We're more likely to find
2253 * the maximum at the end, so search the list backward.
2254 */
2255 max_id = 0;
2256 list_for_each_prev(tmp, &rbd_dev_list) {
2257 struct rbd_device *rbd_dev;
2258
2259 rbd_dev = list_entry(tmp, struct rbd_device, node);
2260 if (rbd_id > max_id)
2261 max_id = rbd_id;
2262 }
Alex Elder499afd52012-02-02 08:13:29 -06002263 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002264
Alex Elder1ddbe942012-01-29 13:57:44 -06002265 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002266 * The max id could have been updated by rbd_id_get(), in
2267 * which case it now accurately reflects the new maximum.
2268 * Be careful not to overwrite the maximum value in that
2269 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002270 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002271 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002272}
2273
/*
 * Advance *buf past any leading white space and return the length
 * of the token (run of non-space characters) now at *buf.  *buf
 * must be '\0'-terminated.  The space set matches isspace() in the
 * "C" and "POSIX" locales.
 */
static inline size_t next_token(const char **buf)
{
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, spaces);

	*buf = start;

	return strcspn(start, spaces);
}
2292
/*
 * Find the next token in *buf and, when the supplied buffer is big
 * enough, copy it there with a terminating '\0'.  *buf must be
 * '\0'-terminated on entry and is advanced past the token whether
 * or not the copy fits.
 *
 * Returns the token length (excluding the '\0'): 0 when no token
 * was found, and >= token_size when the token would not fit.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2322
2323/*
Alex Elderea3352f2012-07-09 21:04:23 -05002324 * Finds the next token in *buf, dynamically allocates a buffer big
2325 * enough to hold a copy of it, and copies the token into the new
2326 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2327 * that a duplicate buffer is created even for a zero-length token.
2328 *
2329 * Returns a pointer to the newly-allocated duplicate, or a null
2330 * pointer if memory for the duplicate was not available. If
2331 * the lenp argument is a non-null pointer, the length of the token
2332 * (not including the '\0') is returned in *lenp.
2333 *
2334 * If successful, the *buf pointer will be updated to point beyond
2335 * the end of the found token.
2336 *
2337 * Note: uses GFP_KERNEL for allocation.
2338 */
2339static inline char *dup_token(const char **buf, size_t *lenp)
2340{
2341 char *dup;
2342 size_t len;
2343
2344 len = next_token(buf);
2345 dup = kmalloc(len + 1, GFP_KERNEL);
2346 if (!dup)
2347 return NULL;
2348
2349 memcpy(dup, *buf, len);
2350 *(dup + len) = '\0';
2351 *buf += len;
2352
2353 if (lenp)
2354 *lenp = len;
2355
2356 return dup;
2357}
2358
2359/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002360 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002361 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2362 * on the list of monitor addresses and other options provided via
2363 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002364 *
2365 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002366 */
/*
 * Parse the "add" specification: <mon_addrs> <options> <pool> <image> [<snap>].
 * Returns 0 on success; -EINVAL on malformed input, -ENOMEM on allocation
 * failure.  On error, every string allocated here is freed and its pointer
 * reset so the caller's cleanup can test rbd_dev->pool_name safely.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/* Monitor addresses: returned as a pointer into buf, not copied */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;	/* token length plus one (see caller) */
	*mon_addrs = buf;

	buf += len;

	/* Options token is copied into the caller-supplied buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;	/* missing or truncated options token */

	/* From here on, failures are allocation failures */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	/* sizeof (RBD_SUFFIX) includes room for the terminating NUL */
	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Free in reverse order; NULL the pointers so cleanup is idempotent */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2441
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002442static ssize_t rbd_add(struct bus_type *bus,
2443 const char *buf,
2444 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002445{
Alex Eldercb8627c2012-07-09 21:04:23 -05002446 char *options;
2447 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002448 const char *mon_addrs = NULL;
2449 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002450 struct ceph_osd_client *osdc;
2451 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002452
2453 if (!try_module_get(THIS_MODULE))
2454 return -ENODEV;
2455
Alex Elder27cc2592012-02-02 08:13:30 -06002456 options = kmalloc(count, GFP_KERNEL);
2457 if (!options)
2458 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002459 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2460 if (!rbd_dev)
2461 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002462
2463 /* static rbd_device initialization */
2464 spin_lock_init(&rbd_dev->lock);
2465 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002466 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002467 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002468
Alex Elderd184f6b2012-01-29 13:57:44 -06002469 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002470 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002471
Alex Eldera725f65e2012-02-02 08:13:30 -06002472 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002473 BUILD_BUG_ON(DEV_NAME_LEN
2474 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002475 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002476
Alex Eldera725f65e2012-02-02 08:13:30 -06002477 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002478 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002479 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002480 if (rc)
2481 goto err_put_id;
2482
Alex Elder5214ecc2012-02-02 08:13:30 -06002483 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2484 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002485 if (IS_ERR(rbd_dev->rbd_client)) {
2486 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002487 rbd_dev->rbd_client = NULL;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002488 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002489 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002490
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002491 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002492 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002493 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2494 if (rc < 0)
2495 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002496 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002497
2498 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002499 rc = register_blkdev(0, rbd_dev->name);
2500 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002501 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002502 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002503
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002504 rc = rbd_bus_add_dev(rbd_dev);
2505 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002506 goto err_out_blkdev;
2507
Alex Elder32eec682012-02-08 16:11:14 -06002508 /*
2509 * At this point cleanup in the event of an error is the job
2510 * of the sysfs code (initiated by rbd_bus_del_dev()).
2511 *
2512 * Set up and announce blkdev mapping.
2513 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002514 rc = rbd_init_disk(rbd_dev);
2515 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002516 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002517
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002518 rc = rbd_init_watch_dev(rbd_dev);
2519 if (rc)
2520 goto err_out_bus;
2521
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002522 return count;
2523
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002524err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002525 /* this will also clean up rest of rbd_dev stuff */
2526
2527 rbd_bus_del_dev(rbd_dev);
2528 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002529 return rc;
2530
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002531err_out_blkdev:
2532 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2533err_out_client:
2534 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002535err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002536 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002537 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002538 kfree(rbd_dev->header_name);
2539 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002540 kfree(rbd_dev->pool_name);
2541 }
Alex Elder499afd52012-02-02 08:13:29 -06002542 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002543err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002544 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002545 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002546
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002547 dout("Error adding device %s\n", buf);
2548 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002549
2550 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002551}
2552
Alex Elderde71a292012-07-03 16:01:19 -05002553static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002554{
2555 struct list_head *tmp;
2556 struct rbd_device *rbd_dev;
2557
Alex Eldere124a822012-01-29 13:57:44 -06002558 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002559 list_for_each(tmp, &rbd_dev_list) {
2560 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002561 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002562 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002563 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002564 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002565 }
Alex Eldere124a822012-01-29 13:57:44 -06002566 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002567 return NULL;
2568}
2569
/*
 * Device-model release callback: final teardown of an rbd_device
 * once its sysfs device is gone.  Order matters: stop the watch
 * machinery first, then drop the ceph client, then tear down the
 * block device, and only then free the strings and the struct.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request, if one was registered */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* Tell the OSD we are no longer watching the header object */
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2600
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002601static ssize_t rbd_remove(struct bus_type *bus,
2602 const char *buf,
2603 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002604{
2605 struct rbd_device *rbd_dev = NULL;
2606 int target_id, rc;
2607 unsigned long ul;
2608 int ret = count;
2609
2610 rc = strict_strtoul(buf, 10, &ul);
2611 if (rc)
2612 return rc;
2613
2614 /* convert to int; abort if we lost anything in the conversion */
2615 target_id = (int) ul;
2616 if (target_id != ul)
2617 return -EINVAL;
2618
2619 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2620
2621 rbd_dev = __rbd_get_dev(target_id);
2622 if (!rbd_dev) {
2623 ret = -ENOENT;
2624 goto done;
2625 }
2626
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002627 __rbd_remove_all_snaps(rbd_dev);
2628 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002629
2630done:
2631 mutex_unlock(&ctl_mutex);
2632 return ret;
2633}
2634
/*
 * sysfs store callback: create a new snapshot with the given name,
 * refresh the header, and notify watchers.  Returns count on
 * success or a negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/*
	 * Copies at most count - 1 bytes, so the final input byte is
	 * dropped — presumably the trailing newline of the sysfs
	 * write.  NOTE(review): a name written without a newline
	 * would lose its last character; confirm callers always
	 * terminate with '\n'.
	 */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	/* Re-read the header so we see the snapshot we just created */
	ret = __rbd_refresh_header(rbd_dev, NULL);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2675
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002676/*
2677 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002678 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002679 */
2680static int rbd_sysfs_init(void)
2681{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002682 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002683
Alex Elderfed4c142012-02-07 12:03:36 -06002684 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002685 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002686 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002687
Alex Elderfed4c142012-02-07 12:03:36 -06002688 ret = bus_register(&rbd_bus_type);
2689 if (ret < 0)
2690 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002691
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002692 return ret;
2693}
2694
static void rbd_sysfs_cleanup(void)
{
	/* Tear down in reverse order of rbd_sysfs_init(): bus, then root */
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2700
2701int __init rbd_init(void)
2702{
2703 int rc;
2704
2705 rc = rbd_sysfs_init();
2706 if (rc)
2707 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002708 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002709 return 0;
2710}
2711
void __exit rbd_exit(void)
{
	/* Module unload: remove the sysfs bus and root device */
	rbd_sysfs_cleanup();
}
2716
/* Module entry/exit hooks and module metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");