blob: 34f46c3b188ff72ed6aee3e766f13b8cdbd89deb [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define	U64_MAX	((u64) (~0ULL))

/* Driver name, used for the block device prefix and log messages */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Pseudo snapshot name used when the writable image head is mapped */
#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Images are mapped read/write unless "ro"/"read_only" is requested */
#define RBD_READ_ONLY_DEFAULT		false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070081
Yehuda Sadeh602adf42010-08-12 16:11:25 -070082/*
83 * block device image metadata (in-memory version)
84 */
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names */
	__u8 obj_order;		/* object size is (1 << obj_order) bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* NUL-separated snapshot name strings */
	u64 *snap_sizes;	/* one entry per snapshot, same order */

	u64 obj_version;
};

struct rbd_options {
	bool	read_only;	/* map the image read-only */
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* shared-client refcount */
	struct list_head	node;	/* entry on rbd_client_list */
};

/*
 * a request completion status
 */
struct rbd_req_status {
	int done;
	int rc;
	u64 bytes;
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;
	int			num_done;
	struct kref		kref;
	struct rbd_req_status	status[0];	/* flexible trailing array */
};

/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;
	struct rbd_req_coll	*coll;
};

struct rbd_snap {
	struct	device		dev;	/* sysfs representation */
	const char		*name;
	u64			size;
	struct list_head	node;	/* entry on rbd_device->snaps */
	u64			id;
};

/* What (snapshot or head) of the image this device currently maps */
struct rbd_mapping {
	char                    *snap_name;
	u64                     snap_id;	/* CEPH_NOSNAP for the head */
	u64                     size;
	bool                    snap_exists;
	bool                    read_only;
};
160
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	struct rbd_options	rbd_opts;
	struct rbd_client	*rbd_client;	/* possibly shared ceph client */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_id;
	size_t			image_id_len;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;	/* name of the header object */
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* currently mapped snapshot/head */

	struct list_head	node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
202
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for snapshot maintenance (defined later in file) */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

/* Bus-level "add"/"remove" attribute handlers (sysfs control interface) */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Nothing to do on release; rbd_root_dev is a static anchor device */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
245
#ifdef RBD_DEBUG
/*
 * Loud assertion: log function, line and expression, then BUG().
 * Wrapped in do { } while (0) so it expands to a single statement;
 * the previous bare if-block form broke constructs like
 * "if (x) rbd_assert(y); else ...".
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800258
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}

static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700270
/*
 * Block device open: refuse writable opens of a read-only mapping,
 * pin the device, and propagate the read-only flag to the blkdev.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
283
/* Block device release: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
298
299/*
300 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500301 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302 */
Alex Elderf8c38922012-08-10 13:12:07 -0700303static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700304{
305 struct rbd_client *rbdc;
306 int ret = -ENOMEM;
307
308 dout("rbd_client_create\n");
309 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
310 if (!rbdc)
311 goto out_opt;
312
313 kref_init(&rbdc->kref);
314 INIT_LIST_HEAD(&rbdc->node);
315
Alex Elderbc534d862012-01-29 13:57:44 -0600316 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
317
Alex Elder43ae4702012-07-03 16:01:18 -0500318 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700319 if (IS_ERR(rbdc->client))
Alex Elderbc534d862012-01-29 13:57:44 -0600320 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500321 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322
323 ret = ceph_open_session(rbdc->client);
324 if (ret < 0)
325 goto out_err;
326
Alex Elder432b8582012-01-29 13:57:44 -0600327 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700328 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600329 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330
Alex Elderbc534d862012-01-29 13:57:44 -0600331 mutex_unlock(&ctl_mutex);
332
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 dout("rbd_client_create created %p\n", rbdc);
334 return rbdc;
335
336out_err:
337 ceph_destroy_client(rbdc->client);
Alex Elderbc534d862012-01-29 13:57:44 -0600338out_mutex:
339 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700340 kfree(rbdc);
341out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500342 if (ceph_opts)
343 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400344 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700345}
346
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when options
 * request an unshared client (CEPH_OPT_NOSHARE) or no match exists.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* take the reference while still under the lock */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
371
372/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700373 * mount options
374 */
375enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700376 Opt_last_int,
377 /* int args above */
378 Opt_last_string,
379 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700380 Opt_read_only,
381 Opt_read_write,
382 /* Boolean args above */
383 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700384};
385
Alex Elder43ae4702012-07-03 16:01:18 -0500386static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700387 /* int args above */
388 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500389 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700390 {Opt_read_only, "ro"}, /* Alternate spelling */
391 {Opt_read_write, "read_write"},
392 {Opt_read_write, "rw"}, /* Alternate spelling */
393 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700394 {-1, NULL}
395};
396
/*
 * Callback for ceph_parse_options(): handle one rbd-specific option
 * token.  @private points at the rbd_options being filled in.
 * Returns 0 on success, -EINVAL for an unrecognized token, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The enum ranges classify each token as int/string/Boolean */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this cannot happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
437
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  On success rbd_dev->rbd_client is set (and
 * rbd_dev->rbd_opts filled in from the option string); returns 0 or
 * a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
				size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific tokens are consumed by parse_rbd_opts_token() */
	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() takes ownership of ceph_opts */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
470
/*
 * Destroy ceph client
 *
 * kref release callback: unlink the client from rbd_client_list and
 * tear it down.  Runs when the last rbd device sharing it goes away.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
488
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;
}
498
/*
 * Destroy requests collection
 *
 * kref release callback for rbd_req_coll; frees the collection
 * (including its trailing status[] array).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700510
/*
 * Sanity-check an on-disk (v1) image header before trusting its
 * counts and lengths.  Returns true if it looks usable.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
539
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540/*
541 * Create a new header structure, translate header format from the on-disk
542 * header.
543 */
544static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500545 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700546{
Alex Elderccece232012-07-10 20:30:10 -0500547 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500548 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500549 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500550 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700551
Alex Elder6a523252012-07-19 17:12:59 -0500552 memset(header, 0, sizeof (*header));
553
Alex Elder103a1502012-08-02 11:29:45 -0500554 snap_count = le32_to_cpu(ondisk->snap_count);
555
Alex Elder58c17b02012-08-23 23:22:06 -0500556 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
557 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500558 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700559 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500560 memcpy(header->object_prefix, ondisk->object_prefix, len);
561 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600562
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700563 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500564 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
565
Alex Elder621901d2012-08-23 23:22:06 -0500566 /* Save a copy of the snapshot names */
567
Alex Elderf785cc12012-08-23 23:22:06 -0500568 if (snap_names_len > (u64) SIZE_MAX)
569 return -EIO;
570 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700571 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500572 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500573 /*
574 * Note that rbd_dev_v1_header_read() guarantees
575 * the ondisk buffer we're working with has
576 * snap_names_len bytes beyond the end of the
577 * snapshot id array, this memcpy() is safe.
578 */
579 memcpy(header->snap_names, &ondisk->snaps[snap_count],
580 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500581
Alex Elder621901d2012-08-23 23:22:06 -0500582 /* Record each snapshot's size */
583
Alex Elderd2bb24e2012-07-26 23:37:14 -0500584 size = snap_count * sizeof (*header->snap_sizes);
585 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700586 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500587 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500588 for (i = 0; i < snap_count; i++)
589 header->snap_sizes[i] =
590 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700591 } else {
Alex Elderccece232012-07-10 20:30:10 -0500592 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700593 header->snap_names = NULL;
594 header->snap_sizes = NULL;
595 }
Alex Elder849b4262012-07-09 21:04:24 -0500596
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 header->obj_order = ondisk->options.order;
598 header->crypt_type = ondisk->options.crypt_type;
599 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500600
Alex Elder621901d2012-08-23 23:22:06 -0500601 /* Allocate and fill in the snapshot context */
602
Alex Elderf84344f2012-08-31 17:29:51 -0500603 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500604 size = sizeof (struct ceph_snap_context);
605 size += snap_count * sizeof (header->snapc->snaps[0]);
606 header->snapc = kzalloc(size, GFP_KERNEL);
607 if (!header->snapc)
608 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609
610 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500611 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500613 for (i = 0; i < snap_count; i++)
614 header->snapc->snaps[i] =
615 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616
617 return 0;
618
Alex Elder6a523252012-07-19 17:12:59 -0500619out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500620 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500621 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500623 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500624 kfree(header->object_prefix);
625 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500626
Alex Elder00f1f362012-02-07 12:03:36 -0600627 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700628}
629
/*
 * Look up a snapshot by name on the device's snapshot list; on a
 * match, record its id and size in the device's mapping.  Returns 0
 * on success, -ENOENT when no snapshot has that name.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->mapping.snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;

			return 0;
		}
	}

	return -ENOENT;
}
646
/*
 * Initialize rbd_dev->mapping for the given snapshot name.  Mapping
 * the special head name ("-") maps the writable image head; any
 * other name maps that snapshot read-only.  Returns 0 or the
 * snap_by_name() error.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.snap_exists = false;
		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		/* snapshots are always mapped read-only */
		rbd_dev->mapping.snap_exists = true;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->mapping.snap_name = snap_name;
done:
	return ret;
}
669
/*
 * Release everything rbd_header_from_disk() allocated, leaving the
 * header safe to free or refill (all pointers NULLed).
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
681
/*
 * Build the name of the object covering image offset @offset:
 * "<object_prefix>.<segment number, 12 hex digits>".  Returns a
 * kmalloc'd string the caller must free, or NULL on failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703
Alex Elder65ccfe22012-08-09 10:33:26 -0700704static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
705{
706 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700707
Alex Elder65ccfe22012-08-09 10:33:26 -0700708 return offset & (segment_size - 1);
709}
710
/*
 * Length of the portion of an I/O starting at image offset @offset
 * that falls within a single segment; an I/O of @length bytes may
 * have to be clipped at the segment boundary.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
724
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700725static int rbd_get_num_segments(struct rbd_image_header *header,
726 u64 ofs, u64 len)
727{
Alex Elderdf111be2012-08-09 10:33:26 -0700728 u64 start_seg;
729 u64 end_seg;
730
731 if (!len)
732 return 0;
733 if (len - 1 > U64_MAX - ofs)
734 return -ERANGE;
735
736 start_seg = ofs >> header->obj_order;
737 end_seg = (ofs + len - 1) >> header->obj_order;
738
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700739 return end_seg - start_seg + 1;
740}
741
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700742/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700743 * returns the size of an object in the image
744 */
745static u64 rbd_obj_bytes(struct rbd_image_header *header)
746{
747 return 1 << header->obj_order;
748}
749
750/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700751 * bio helpers
752 */
753
754static void bio_chain_put(struct bio *chain)
755{
756 struct bio *tmp;
757
758 while (chain) {
759 tmp = chain;
760 chain = chain->bi_next;
761 bio_put(tmp);
762 }
763}
764
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain, zeroing the bytes
 * that lie at or beyond @start_ofs (an offset into the whole chain).
 * Segments entirely before start_ofs are left untouched.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero only the part past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
791
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until exactly len bytes are covered.  On
 * return, *old points at the first unconsumed source bio, *next at
 * the bio where the next clone pass should resume (either the
 * remainder half of a split, or the next source bio), and the
 * cloned chain is returned.  Any previously returned bio_pair in
 * *bp is released first.  Returns NULL on allocation/split failure,
 * after putting any clones made so far.
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *old_chain = *old;
	struct bio *new_chain = NULL;
	struct bio *tail;	/* last clone appended; valid once new_chain != NULL */
	int total = 0;		/* bytes cloned so far */

	/* release the split left over from the previous call, if any */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		struct bio *tmp;

		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;
		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			/* clone the first half; the second half is left for later */
			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			/* whole source bio fits; clone it as-is */
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		/* detach the clone from the source's device/linkage */
		tmp->bi_bdev = NULL;
		tmp->bi_next = NULL;
		if (new_chain)
			tail->bi_next = tmp;
		else
			new_chain = tmp;
		tail = tmp;
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	/*
	 * NOTE(review): this asserts the source chain covered len
	 * exactly -- presumably guaranteed by the request sizing done
	 * in the caller; confirm a short chain cannot reach here.
	 */
	rbd_assert(total == len);

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
866
867/*
868 * helpers for osd request op vectors.
869 */
Alex Elder57cfc102012-06-26 12:57:03 -0700870static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
871 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700872{
Alex Elder57cfc102012-06-26 12:57:03 -0700873 struct ceph_osd_req_op *ops;
874
875 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
876 if (!ops)
877 return NULL;
878
879 ops[0].op = opcode;
880
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700881 /*
882 * op extent offset and length will be set later on
883 * in calc_raw_layout()
884 */
Alex Elder57cfc102012-06-26 12:57:03 -0700885 ops[0].payload_len = payload_len;
886
887 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888}
889
/*
 * Free an op vector allocated by rbd_create_rw_ops().
 * kfree(NULL) is a no-op, so a NULL vector is safe.
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
894
/*
 * Record completion of one slot of a request collection and, while
 * holding the queue lock, retire every contiguous run of completed
 * slots starting at num_done against the block request.  Slots must
 * complete the block request in order, which is why out-of-order
 * completions are parked in coll->status[] until their turn.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	/* nothing to complete without a block request */
	if (!rq)
		return;

	/* no collection: the request covers a single segment */
	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue lock protects coll state and __blk_end_request() */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* find the contiguous run of finished slots from num_done on */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* drop the per-slot reference taken at submit time */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
932
933static void rbd_coll_end_req(struct rbd_request *req,
934 int ret, u64 len)
935{
936 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
937}
938
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700939/*
940 * Send ceph osd request
941 */
942static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500943 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700944 struct ceph_snap_context *snapc,
945 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500946 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 struct bio *bio,
948 struct page **pages,
949 int num_pages,
950 int flags,
951 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700952 struct rbd_req_coll *coll,
953 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700954 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700955 struct ceph_msg *msg),
956 struct ceph_osd_request **linger_req,
957 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958{
959 struct ceph_osd_request *req;
960 struct ceph_file_layout *layout;
961 int ret;
962 u64 bno;
963 struct timespec mtime = CURRENT_TIME;
964 struct rbd_request *req_data;
965 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600966 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700969 if (!req_data) {
970 if (coll)
971 rbd_coll_end_req_index(rq, coll, coll_index,
972 -ENOMEM, len);
973 return -ENOMEM;
974 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700976 if (coll) {
977 req_data->coll = coll;
978 req_data->coll_index = coll_index;
979 }
980
Alex Elderbd919d42012-07-13 20:35:11 -0500981 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
982 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700983
Alex Elder0ce1a792012-07-03 16:01:18 -0500984 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600985 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
986 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700987 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700988 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700989 goto done_pages;
990 }
991
992 req->r_callback = rbd_cb;
993
994 req_data->rq = rq;
995 req_data->bio = bio;
996 req_data->pages = pages;
997 req_data->len = len;
998
999 req->r_priv = req_data;
1000
1001 reqhead = req->r_request->front.iov_base;
1002 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1003
Alex Elderaded07e2012-07-03 16:01:18 -05001004 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005 req->r_oid_len = strlen(req->r_oid);
1006
1007 layout = &req->r_file_layout;
1008 memset(layout, 0, sizeof(*layout));
1009 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1010 layout->fl_stripe_count = cpu_to_le32(1);
1011 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001012 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001013 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1014 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001015
1016 ceph_osdc_build_request(req, ofs, &len,
1017 ops,
1018 snapc,
1019 &mtime,
1020 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001021
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001022 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001023 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001024 *linger_req = req;
1025 }
1026
Alex Elder1dbb4392012-01-24 10:08:37 -06001027 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001028 if (ret < 0)
1029 goto done_err;
1030
1031 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001032 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001033 if (ver)
1034 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001035 dout("reassert_ver=%llu\n",
1036 (unsigned long long)
1037 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001038 ceph_osdc_put_request(req);
1039 }
1040 return ret;
1041
1042done_err:
1043 bio_chain_put(req_data->bio);
1044 ceph_osdc_put_request(req);
1045done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001046 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001047 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001048 return ret;
1049}
1050
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous per-segment requests issued
 * by rbd_do_op().  Parses the reply, fixes up short/missing reads
 * by zero-filling the bio data, completes the collection slot, and
 * releases the request and its private data.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	/* the op array immediately follows the reply head */
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* reading a hole (nonexistent object): return zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1090
/*
 * Completion callback for fire-and-forget requests (e.g. notify
 * acks) whose only cleanup is dropping the request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1095
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to carry the data, submits the request
 * via rbd_do_request() with no callback (so it waits for
 * completion), and -- for reads -- copies the reply data into the
 * caller's "inbound" buffer.  Returns the number of bytes read, or
 * a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* rbd_cb == NULL makes rbd_do_request() wait for completion */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* ret is the byte count actually transferred by the OSD */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1139
/*
 * Do an asynchronous ceph osd operation
 *
 * Issue a read or write for one image extent that must lie entirely
 * within a single object (segment); the caller guarantees this by
 * splitting bios at segment boundaries beforehand.  Completion is
 * reported through the collection slot via rbd_req_cb().
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	/* name of the object backing this segment (caller-owned, freed below) */
	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* only writes carry a data payload */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1192
1193/*
1194 * Request async osd write
1195 */
1196static int rbd_req_write(struct request *rq,
1197 struct rbd_device *rbd_dev,
1198 struct ceph_snap_context *snapc,
1199 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001200 struct bio *bio,
1201 struct rbd_req_coll *coll,
1202 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001203{
1204 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1205 CEPH_OSD_OP_WRITE,
1206 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001207 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001208}
1209
1210/*
1211 * Request async osd read
1212 */
1213static int rbd_req_read(struct request *rq,
1214 struct rbd_device *rbd_dev,
1215 u64 snapid,
1216 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001217 struct bio *bio,
1218 struct rbd_req_coll *coll,
1219 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001220{
1221 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001222 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001223 CEPH_OSD_OP_READ,
1224 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001225 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001226}
1227
1228/*
1229 * Request sync osd read
1230 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001231static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001232 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001233 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001235 char *buf,
1236 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001237{
Alex Elder913d2fd2012-06-26 12:57:03 -07001238 struct ceph_osd_req_op *ops;
1239 int ret;
1240
1241 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1242 if (!ops)
1243 return -ENOMEM;
1244
1245 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001246 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001247 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001248 ops, object_name, ofs, len, buf, NULL, ver);
1249 rbd_destroy_ops(ops);
1250
1251 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001252}
1253
/*
 * Acknowledge a watch notification on the header object.
 *
 * Despite living among the "sync" helpers this is fire-and-forget:
 * completion is handled by rbd_simple_req_cb(), which just drops
 * the request reference.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	/*
	 * NOTE(review): notify_id is stored without cpu_to_le64(),
	 * unlike .ver above -- presumably it arrives from the osd
	 * client already in wire byte order; confirm against the
	 * osd_client notify path.
	 */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1283
1284static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1285{
Alex Elder0ce1a792012-07-03 16:01:18 -05001286 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001287 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001288 int rc;
1289
Alex Elder0ce1a792012-07-03 16:01:18 -05001290 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001291 return;
1292
Alex Elderbd919d42012-07-13 20:35:11 -05001293 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1294 rbd_dev->header_name, (unsigned long long) notify_id,
1295 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001296 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001297 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001298 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001299 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001300
Alex Elder7f0a24d2012-07-25 09:32:40 -05001301 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001302}
1303
/*
 * Request sync osd watch
 *
 * Register a watch on the header object so rbd_watch_cb() is
 * invoked when the header changes (e.g. snapshots are created or
 * removed).  The osd request is kept lingering in
 * rbd_dev->watch_request so the watch persists across reconnects.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	/* the event delivers notifications back to rbd_watch_cb() */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == establish the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1347
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001348/*
1349 * Request sync osd unwatch
1350 */
Alex Elder070c6332012-07-25 09:32:41 -05001351static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001352{
1353 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001354 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001355
Alex Elder57cfc102012-06-26 12:57:03 -07001356 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1357 if (!ops)
1358 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001359
1360 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001361 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001362 ops[0].watch.flag = 0;
1363
Alex Elder0ce1a792012-07-03 16:01:18 -05001364 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001365 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001366 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1367 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001368 rbd_dev->header_name,
1369 0, 0, NULL, NULL, NULL);
1370
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001371
1372 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001373 ceph_osdc_cancel_event(rbd_dev->watch_event);
1374 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001375 return ret;
1376}
1377
/* Context handed to rbd_notify_cb() via ceph_osdc_create_event(). */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;
};
1381
1382static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1383{
Alex Elder0ce1a792012-07-03 16:01:18 -05001384 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1385 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001386 return;
1387
Alex Elderbd919d42012-07-13 20:35:11 -05001388 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1389 rbd_dev->header_name, (unsigned long long) notify_id,
1390 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001391}
1392
/*
 * Request sync osd notify
 *
 * Send a NOTIFY op on the header object to wake other watchers,
 * then wait (bounded by CEPH_OSD_TIMEOUT_DEFAULT) for the notify to
 * complete.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload carries two u32s: protocol version and timeout */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	/* one-shot event (third argument == 1) for our own notify */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
		       CEPH_NOSNAP,
		       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
		       ops,
		       rbd_dev->header_name,
		       0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/*
	 * NOTE(review): the wait result is only logged -- the
	 * function returns 0 even on timeout -- and the event is not
	 * canceled on this success path.  Confirm whether a one-shot
	 * event is released by the osd client after it fires.
	 */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1442
/*
 * Synchronous osd object method call
 *
 * Invoke class_name.method_name on object_name via a CALL op.
 * outbound/outbound_size carry the method's input data;
 * inbound/inbound_size receive the reply payload (only copied back
 * when flags includes CEPH_OSD_FLAG_READ).  ver, if non-NULL,
 * receives the object version.  Returns bytes received or a
 * negative errno.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
		       CEPH_NOSNAP,
		       flags, ops,
		       object_name, 0, inbound_size, inbound,
		       NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1495
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001496static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1497{
1498 struct rbd_req_coll *coll =
1499 kzalloc(sizeof(struct rbd_req_coll) +
1500 sizeof(struct rbd_req_status) * num_reqs,
1501 GFP_ATOMIC);
1502
1503 if (!coll)
1504 return NULL;
1505 coll->total = num_reqs;
1506 kref_init(&coll->kref);
1507 return coll;
1508}
1509
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001510/*
1511 * block device queue callback
1512 */
1513static void rbd_rq_fn(struct request_queue *q)
1514{
1515 struct rbd_device *rbd_dev = q->queuedata;
1516 struct request *rq;
1517 struct bio_pair *bp = NULL;
1518
Alex Elder00f1f362012-02-07 12:03:36 -06001519 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520 struct bio *bio;
1521 struct bio *rq_bio, *next_bio = NULL;
1522 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001523 unsigned int size;
1524 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001525 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001526 int num_segs, cur_seg = 0;
1527 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001528 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001530 dout("fetched request\n");
1531
1532 /* filter out block requests we don't understand */
1533 if ((rq->cmd_type != REQ_TYPE_FS)) {
1534 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001535 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001536 }
1537
1538 /* deduce our operation (read, write) */
1539 do_write = (rq_data_dir(rq) == WRITE);
1540
1541 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001542 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001543 rq_bio = rq->bio;
Alex Elderf84344f2012-08-31 17:29:51 -05001544 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001545 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001546 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001547 }
1548
1549 spin_unlock_irq(q->queue_lock);
1550
Josh Durgind1d25642011-12-05 14:03:05 -08001551 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001552
Alex Elderf84344f2012-08-31 17:29:51 -05001553 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1554 !rbd_dev->mapping.snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001555 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001556 dout("request for non-existent snapshot");
1557 spin_lock_irq(q->queue_lock);
1558 __blk_end_request_all(rq, -ENXIO);
1559 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001560 }
1561
Josh Durgind1d25642011-12-05 14:03:05 -08001562 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1563
1564 up_read(&rbd_dev->header_rwsem);
1565
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001566 dout("%s 0x%x bytes at 0x%llx\n",
1567 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001568 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001569
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001570 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001571 if (num_segs <= 0) {
1572 spin_lock_irq(q->queue_lock);
1573 __blk_end_request_all(rq, num_segs);
1574 ceph_put_snap_context(snapc);
1575 continue;
1576 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001577 coll = rbd_alloc_coll(num_segs);
1578 if (!coll) {
1579 spin_lock_irq(q->queue_lock);
1580 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001581 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001582 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001583 }
1584
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001585 do {
1586 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001587 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elder65ccfe22012-08-09 10:33:26 -07001588 op_size = rbd_segment_length(rbd_dev, ofs, size);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001589 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001590 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1591 op_size, GFP_ATOMIC);
1592 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001593 rbd_coll_end_req_index(rq, coll, cur_seg,
1594 -ENOMEM, op_size);
1595 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001596 }
1597
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001598
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001599 /* init OSD command: write or read */
1600 if (do_write)
1601 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001602 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001603 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001604 op_size, bio,
1605 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001606 else
1607 rbd_req_read(rq, rbd_dev,
Alex Elderf84344f2012-08-31 17:29:51 -05001608 rbd_dev->mapping.snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001609 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001610 op_size, bio,
1611 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001612
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001613next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001614 size -= op_size;
1615 ofs += op_size;
1616
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001617 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618 rq_bio = next_bio;
1619 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001620 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001621
1622 if (bp)
1623 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001624 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001625
1626 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001627 }
1628}
1629
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* object size, in 512-byte sectors */
	sector_t sector;		/* absolute start sector of the bio */
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	/* obj_order is log2 of the object size in bytes */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes left in the object once the current bio contents are added */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* always accept the first bvec, even if it crosses an object
	 * boundary -- the single-page case is split later */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1656
1657static void rbd_free_disk(struct rbd_device *rbd_dev)
1658{
1659 struct gendisk *disk = rbd_dev->disk;
1660
1661 if (!disk)
1662 return;
1663
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001664 if (disk->flags & GENHD_FL_UP)
1665 del_gendisk(disk);
1666 if (disk->queue)
1667 blk_cleanup_queue(disk->queue);
1668 put_disk(disk);
1669}
1670
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;	/* 0 on first pass: read just the fixed part */
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* free(NULL) is a no-op on the first pass */

		/*
		 * NOTE(review): snap_count and names_size come from the
		 * previous on-disk read; the arithmetic below is not
		 * checked for overflow (size_t on 32-bit) -- confirm an
		 * upper bound is enforced elsewhere.
		 */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* a short read means the object shrank underneath us */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* loop again if snapshots were added/removed meanwhile */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1742
1743/*
1744 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001745 */
1746static int rbd_read_header(struct rbd_device *rbd_dev,
1747 struct rbd_image_header *header)
1748{
Alex Elder4156d992012-08-02 11:29:46 -05001749 struct rbd_image_header_ondisk *ondisk;
1750 u64 ver = 0;
1751 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001752
Alex Elder4156d992012-08-02 11:29:46 -05001753 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1754 if (IS_ERR(ondisk))
1755 return PTR_ERR(ondisk);
1756 ret = rbd_header_from_disk(header, ondisk);
1757 if (ret >= 0)
1758 header->obj_version = ver;
1759 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001760
Alex Elder4156d992012-08-02 11:29:46 -05001761 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001762}
1763
1764/*
1765 * create a snapshot
1766 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001767static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001768 const char *snap_name,
1769 gfp_t gfp_flags)
1770{
1771 int name_len = strlen(snap_name);
1772 u64 new_snapid;
1773 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001774 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001775 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001776
1777 /* we should create a snapshot only if we're pointing at the head */
Alex Elderf84344f2012-08-31 17:29:51 -05001778 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001779 return -EINVAL;
1780
Alex Elder0ce1a792012-07-03 16:01:18 -05001781 monc = &rbd_dev->rbd_client->client->monc;
1782 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001783 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001784 if (ret < 0)
1785 return ret;
1786
1787 data = kmalloc(name_len + 16, gfp_flags);
1788 if (!data)
1789 return -ENOMEM;
1790
Sage Weil916d4d62011-05-12 16:10:50 -07001791 p = data;
1792 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001793
Sage Weil916d4d62011-05-12 16:10:50 -07001794 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1795 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001796
Alex Elder0bed54d2012-07-03 16:01:18 -05001797 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001798 "rbd", "snap_add",
Alex Elderf8d4de62012-07-03 16:01:19 -05001799 data, (size_t) (p - data), NULL, 0,
Alex Elder3cb4a682012-06-26 12:57:03 -07001800 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1801 NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001802
Sage Weil916d4d62011-05-12 16:10:50 -07001803 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001804
Alex Elder505cbb92012-07-19 08:49:18 -05001805 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001806bad:
1807 return -ERANGE;
1808}
1809
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001810static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1811{
1812 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001813 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001814
Alex Eldera0593292012-07-19 09:09:27 -05001815 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001816 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001817}
1818
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the header from disk and, under header_rwsem held for
 * write, swaps the freshly read snapshot metadata into the device's
 * in-core header, updates the mapped capacity if the base image was
 * resized, and brings the snapshot device list in sync.
 * Returns 0 on success or a negative errno.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* everything below mutates shared header state */
	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		/* only a head (non-snapshot) mapping can change size */
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	/* take ownership of the arrays read into h */
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* reconcile the snapshot device list with the new context */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1870
Alex Elder1fe5e992012-07-25 09:32:41 -05001871static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1872{
1873 int ret;
1874
1875 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1876 ret = __rbd_refresh_header(rbd_dev, hver);
1877 mutex_unlock(&ctl_mutex);
1878
1879 return ret;
1880}
1881
/*
 * Allocate and configure the gendisk and request queue for the
 * mapped image.  I/O limits are sized to the rados object size so a
 * request never spans more than one object than necessary.  Returns
 * 0 on success, -ENOMEM if the disk or queue cannot be allocated.
 * The disk is set up but not yet added to the system here.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* keep bios from spanning object boundaries (see rbd_merge_bvec) */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1930
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001931/*
1932 sysfs
1933*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001934
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1939
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001940static ssize_t rbd_size_show(struct device *dev,
1941 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001942{
Alex Elder593a9e72012-02-07 12:03:37 -06001943 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001944 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001945
Josh Durgina51aa0c2011-12-05 10:35:04 -08001946 down_read(&rbd_dev->header_rwsem);
1947 size = get_capacity(rbd_dev->disk);
1948 up_read(&rbd_dev->header_rwsem);
1949
1950 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001951}
1952
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001953static ssize_t rbd_major_show(struct device *dev,
1954 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001955{
Alex Elder593a9e72012-02-07 12:03:37 -06001956 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001957
1958 return sprintf(buf, "%d\n", rbd_dev->major);
1959}
1960
1961static ssize_t rbd_client_id_show(struct device *dev,
1962 struct device_attribute *attr, char *buf)
1963{
Alex Elder593a9e72012-02-07 12:03:37 -06001964 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001965
Alex Elder1dbb4392012-01-24 10:08:37 -06001966 return sprintf(buf, "client%lld\n",
1967 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968}
1969
1970static ssize_t rbd_pool_show(struct device *dev,
1971 struct device_attribute *attr, char *buf)
1972{
Alex Elder593a9e72012-02-07 12:03:37 -06001973 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001974
1975 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1976}
1977
Alex Elder9bb2f332012-07-12 10:46:35 -05001978static ssize_t rbd_pool_id_show(struct device *dev,
1979 struct device_attribute *attr, char *buf)
1980{
1981 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1982
1983 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1984}
1985
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001986static ssize_t rbd_name_show(struct device *dev,
1987 struct device_attribute *attr, char *buf)
1988{
Alex Elder593a9e72012-02-07 12:03:37 -06001989 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001990
Alex Elder0bed54d2012-07-03 16:01:18 -05001991 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001992}
1993
Alex Elder589d30e2012-07-10 20:30:11 -05001994static ssize_t rbd_image_id_show(struct device *dev,
1995 struct device_attribute *attr, char *buf)
1996{
1997 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1998
1999 return sprintf(buf, "%s\n", rbd_dev->image_id);
2000}
2001
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002002static ssize_t rbd_snap_show(struct device *dev,
2003 struct device_attribute *attr,
2004 char *buf)
2005{
Alex Elder593a9e72012-02-07 12:03:37 -06002006 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002007
Alex Elderf84344f2012-08-31 17:29:51 -05002008 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002009}
2010
2011static ssize_t rbd_image_refresh(struct device *dev,
2012 struct device_attribute *attr,
2013 const char *buf,
2014 size_t size)
2015{
Alex Elder593a9e72012-02-07 12:03:37 -06002016 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002017 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002018
Alex Elder1fe5e992012-07-25 09:32:41 -05002019 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002020
2021 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002022}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002023
/* Per-device sysfs attributes, exposed under /sys/bus/rbd/devices/<id>/ */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* No-op: the device's own ->release (rbd_dev_release, set in
 * rbd_bus_add_dev()) handles teardown. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2067
2068
2069/*
2070 sysfs - snapshots
2071*/
2072
2073static ssize_t rbd_snap_size_show(struct device *dev,
2074 struct device_attribute *attr,
2075 char *buf)
2076{
2077 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2078
Josh Durgin35915382011-12-05 18:25:13 -08002079 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002080}
2081
2082static ssize_t rbd_snap_id_show(struct device *dev,
2083 struct device_attribute *attr,
2084 char *buf)
2085{
2086 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2087
Josh Durgin35915382011-12-05 18:25:13 -08002088 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002089}
2090
/* Per-snapshot sysfs attributes (under the parent rbd device) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final teardown of a snapshot: invoked by the driver core when the
 * snapshot device's last reference is dropped; frees the rbd_snap
 * and its name. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2120
/*
 * A snapshot counts as registered once its device type has been set
 * (done in rbd_register_snap_dev() just before device_register()).
 * The assertion checks that this marker and the driver core's own
 * registration state never disagree.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/* !ret ^ reg is true exactly when ret == reg */
	rbd_assert(!ret ^ reg);

	return ret;
}
2130
/*
 * Unlink a snapshot from its device's list and, if it made it into
 * sysfs, unregister it (dropping the reference that frees it via
 * rbd_snap_dev_release()).
 * NOTE(review): an unregistered snap is only unlinked here, not
 * freed -- confirm its memory is released elsewhere in that case.
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	if (device_is_registered(&snap->dev))
		device_unregister(&snap->dev);
}
2137
/*
 * Register a snapshot's device under the given parent (the rbd
 * device), so it appears in sysfs as "snap_<name>".  Setting the
 * device type here is also what marks the snap as registered for
 * rbd_snap_registered().  Returns the device_register() result.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
				 struct device *parent)
{
	struct device *dev = &snap->dev;
	int ret;

	dev->type = &rbd_snap_device_type;
	dev->parent = parent;
	/* same function the device type already names as its release */
	dev->release = rbd_snap_dev_release;
	dev_set_name(dev, "snap_%s", snap->name);
	dout("%s: registering device for snapshot %s\n", __func__, snap->name);

	ret = device_register(dev);

	return ret;
}
2154
Alex Elder4e891e02012-07-10 20:30:10 -05002155static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2156 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002157{
Alex Elder4e891e02012-07-10 20:30:10 -05002158 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002159 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002160
2161 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002162 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002163 return ERR_PTR(-ENOMEM);
2164
2165 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002166 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002167 if (!snap->name)
2168 goto err;
2169
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002170 snap->size = rbd_dev->header.snap_sizes[i];
2171 snap->id = rbd_dev->header.snapc->snaps[i];
Alex Elder4e891e02012-07-10 20:30:10 -05002172
2173 return snap;
2174
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002175err:
2176 kfree(snap->name);
2177 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002178
2179 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002180}
2181
2182/*
Alex Elder35938152012-08-02 11:29:46 -05002183 * Scan the rbd device's current snapshot list and compare it to the
2184 * newly-received snapshot context. Remove any existing snapshots
2185 * not present in the new snapshot context. Add a new snapshot for
2186 * any snaphots in the snapshot context not in the current list.
2187 * And verify there are no changes to snapshots we already know
2188 * about.
2189 *
2190 * Assumes the snapshots in the snapshot context are sorted by
2191 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2192 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002193 */
Alex Elder304f6802012-08-31 17:29:52 -05002194static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002195{
Alex Elder35938152012-08-02 11:29:46 -05002196 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2197 const u32 snap_count = snapc->num_snaps;
2198 char *snap_name = rbd_dev->header.snap_names;
2199 struct list_head *head = &rbd_dev->snaps;
2200 struct list_head *links = head->next;
2201 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002202
Alex Elder9fcbb802012-08-23 23:48:49 -05002203 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002204 while (index < snap_count || links != head) {
2205 u64 snap_id;
2206 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002207
Alex Elder35938152012-08-02 11:29:46 -05002208 snap_id = index < snap_count ? snapc->snaps[index]
2209 : CEPH_NOSNAP;
2210 snap = links != head ? list_entry(links, struct rbd_snap, node)
2211 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002212 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002213
Alex Elder35938152012-08-02 11:29:46 -05002214 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2215 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002216
Alex Elder35938152012-08-02 11:29:46 -05002217 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002218
Alex Elderf84344f2012-08-31 17:29:51 -05002219 if (rbd_dev->mapping.snap_id == snap->id)
2220 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002221 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002222 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002223 rbd_dev->mapping.snap_id == snap->id ?
2224 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002225 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002226
Alex Elder35938152012-08-02 11:29:46 -05002227 /* Done with this list entry; advance */
2228
2229 links = next;
2230 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002231 }
Alex Elder35938152012-08-02 11:29:46 -05002232
Alex Elder9fcbb802012-08-23 23:48:49 -05002233 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2234 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002235 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2236 struct rbd_snap *new_snap;
2237
2238 /* We haven't seen this snapshot before */
2239
2240 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2241 snap_name);
Alex Elder9fcbb802012-08-23 23:48:49 -05002242 if (IS_ERR(new_snap)) {
2243 int err = PTR_ERR(new_snap);
2244
2245 dout(" failed to add dev, error %d\n", err);
2246
2247 return err;
2248 }
Alex Elder35938152012-08-02 11:29:46 -05002249
2250 /* New goes before existing, or at end of list */
2251
Alex Elder9fcbb802012-08-23 23:48:49 -05002252 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002253 if (snap)
2254 list_add_tail(&new_snap->node, &snap->node);
2255 else
Alex Elder523f3252012-08-30 00:16:37 -05002256 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002257 } else {
2258 /* Already have this one */
2259
Alex Elder9fcbb802012-08-23 23:48:49 -05002260 dout(" already present\n");
2261
Alex Elderaafb2302012-09-06 16:00:54 -05002262 rbd_assert(snap->size ==
2263 rbd_dev->header.snap_sizes[index]);
2264 rbd_assert(!strcmp(snap->name, snap_name));
Alex Elder35938152012-08-02 11:29:46 -05002265
2266 /* Done with this list entry; advance */
2267
2268 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002269 }
Alex Elder35938152012-08-02 11:29:46 -05002270
2271 /* Advance to the next entry in the snapshot context */
2272
2273 index++;
2274 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002275 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002276 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002277
2278 return 0;
2279}
2280
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 *
 * Returns 0 on success; stops and returns the first registration
 * error otherwise.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* the parent rbd device must be in sysfs before its children */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;	/* stop at the first failure */
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
2305
/*
 * Register the rbd device on the rbd bus, making it visible in
 * sysfs under its numeric device id.  Serialized by ctl_mutex.
 * Returns the device_register() result.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	/* the per-device release supersedes the type's no-op release */
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2325
/*
 * Remove the rbd device from sysfs.  Once the last reference to the
 * embedded struct device is dropped, the driver core invokes
 * rbd_dev_release() (installed by rbd_bus_add_dev()), which performs
 * the remaining teardown.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2330
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002331static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2332{
2333 int ret, rc;
2334
2335 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002336 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002337 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002338 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002339 if (rc < 0)
2340 return rc;
2341 }
2342 } while (ret == -ERANGE);
2343
2344 return ret;
2345}
2346
Alex Eldere2839302012-08-29 17:11:06 -05002347static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002348
/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() makes id allocation lock-free; first id is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	/* rbd_dev_list_lock guards the global rbd_dev_list */
	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002363
Alex Elder1ddbe942012-01-29 13:57:44 -06002364/*
Alex Elder499afd52012-02-02 08:13:29 -06002365 * Remove an rbd_dev from the global list, and record that its
2366 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002367 */
Alex Eldere2839302012-08-29 17:11:06 -05002368static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002369{
Alex Elderd184f6b2012-01-29 13:57:44 -06002370 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002371 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002372 int max_id;
2373
Alex Elderaafb2302012-09-06 16:00:54 -05002374 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002375
Alex Eldere2839302012-08-29 17:11:06 -05002376 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2377 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002378 spin_lock(&rbd_dev_list_lock);
2379 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002380
2381 /*
2382 * If the id being "put" is not the current maximum, there
2383 * is nothing special we need to do.
2384 */
Alex Eldere2839302012-08-29 17:11:06 -05002385 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002386 spin_unlock(&rbd_dev_list_lock);
2387 return;
2388 }
2389
2390 /*
2391 * We need to update the current maximum id. Search the
2392 * list to find out what it is. We're more likely to find
2393 * the maximum at the end, so search the list backward.
2394 */
2395 max_id = 0;
2396 list_for_each_prev(tmp, &rbd_dev_list) {
2397 struct rbd_device *rbd_dev;
2398
2399 rbd_dev = list_entry(tmp, struct rbd_device, node);
2400 if (rbd_id > max_id)
2401 max_id = rbd_id;
2402 }
Alex Elder499afd52012-02-02 08:13:29 -06002403 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002404
Alex Elder1ddbe942012-01-29 13:57:44 -06002405 /*
Alex Eldere2839302012-08-29 17:11:06 -05002406 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002407 * which case it now accurately reflects the new maximum.
2408 * Be careful not to overwrite the maximum value in that
2409 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002410 */
Alex Eldere2839302012-08-29 17:11:06 -05002411 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2412 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002413}
2414
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in "C"/"POSIX" locales */
	static const char delims[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, delims);	/* skip leading whitespace */
	*buf = start;

	return strcspn(start, delims);		/* length of the token */
}
2433
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	/* Copy only when the token (plus its terminator) fits */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
2463
2464/*
Alex Elderea3352f2012-07-09 21:04:23 -05002465 * Finds the next token in *buf, dynamically allocates a buffer big
2466 * enough to hold a copy of it, and copies the token into the new
2467 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2468 * that a duplicate buffer is created even for a zero-length token.
2469 *
2470 * Returns a pointer to the newly-allocated duplicate, or a null
2471 * pointer if memory for the duplicate was not available. If
2472 * the lenp argument is a non-null pointer, the length of the token
2473 * (not including the '\0') is returned in *lenp.
2474 *
2475 * If successful, the *buf pointer will be updated to point beyond
2476 * the end of the found token.
2477 *
2478 * Note: uses GFP_KERNEL for allocation.
2479 */
2480static inline char *dup_token(const char **buf, size_t *lenp)
2481{
2482 char *dup;
2483 size_t len;
2484
2485 len = next_token(buf);
2486 dup = kmalloc(len + 1, GFP_KERNEL);
2487 if (!dup)
2488 return NULL;
2489
2490 memcpy(dup, *buf, len);
2491 *(dup + len) = '\0';
2492 *buf += len;
2493
2494 if (lenp)
2495 *lenp = len;
2496
2497 return dup;
2498}
2499
/*
 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
 * rbd_md_name, and name fields of the given rbd_dev, based on the
 * list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map if successful, or a
 * pointer-coded error otherwise.
 *
 * On success the caller owns the returned snapshot name (and must
 * free it); pool_name and image_name remain attached to rbd_dev.
 * On failure any partially-filled rbd_dev string fields are freed
 * and reset to NULL here.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	/* -EINVAL for missing/oversized required tokens, -ENOMEM after */
	char *err_ptr = ERR_PTR(-EINVAL);
	char *snap_name;

	/* The first four tokens are required */

	/* Token 1: monitor address list (points into buf, not copied) */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Token 2: options string, copied into the caller's buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* From here on, failures are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);

	/* Token 3: pool name (duplicated, owned by rbd_dev) */
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	/* Token 4: image name (duplicated, owned by rbd_dev) */
	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo any duplications made above; leave rbd_dev fields NULL */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2569
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof (RBD_ID_PREFIX) includes the NUL, covering the result's */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of class "rbd" on the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Decode the length-prefixed string into a fresh allocation */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* keep NULL on error, per contract */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2638
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002639static ssize_t rbd_add(struct bus_type *bus,
2640 const char *buf,
2641 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002642{
Alex Eldercb8627c2012-07-09 21:04:23 -05002643 char *options;
2644 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002645 const char *mon_addrs = NULL;
2646 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002647 struct ceph_osd_client *osdc;
2648 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002649 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002650
2651 if (!try_module_get(THIS_MODULE))
2652 return -ENODEV;
2653
Alex Elder27cc2592012-02-02 08:13:30 -06002654 options = kmalloc(count, GFP_KERNEL);
2655 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05002656 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002657 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2658 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05002659 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660
2661 /* static rbd_device initialization */
2662 spin_lock_init(&rbd_dev->lock);
2663 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002664 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002665 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002666
Alex Eldera725f65e2012-02-02 08:13:30 -06002667 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002668 snap_name = rbd_add_parse_args(rbd_dev, buf,
2669 &mon_addrs, &mon_addrs_size, options, count);
2670 if (IS_ERR(snap_name)) {
2671 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05002672 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05002673 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002674
Alex Elderf8c38922012-08-10 13:12:07 -07002675 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2676 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002677 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002678
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002679 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002680 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002681 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2682 if (rc < 0)
2683 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002684 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002685
Alex Elder589d30e2012-07-10 20:30:11 -05002686 rc = rbd_dev_image_id(rbd_dev);
2687 if (!rc) {
2688 rc = -ENOTSUPP; /* Not actually supporting format 2 yet */
2689 goto err_out_client;
2690 }
2691
2692 /* Version 1 images have no id; empty string is used */
2693
2694 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2695 if (!rbd_dev->image_id) {
2696 rc = -ENOMEM;
2697 goto err_out_client;
2698 }
2699 rbd_dev->image_id_len = 0;
2700
Alex Elder3fcf2582012-07-03 16:01:19 -05002701 /* Create the name of the header object */
2702
2703 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2704 + sizeof (RBD_SUFFIX),
2705 GFP_KERNEL);
2706 if (!rbd_dev->header_name)
2707 goto err_out_client;
2708 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2709
Alex Elder05fd6f62012-08-29 17:11:07 -05002710 /* Get information about the image being mapped */
2711
2712 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
2713 if (rc)
2714 goto err_out_client;
2715
2716 /* no need to lock here, as rbd_dev is not registered yet */
2717 rc = rbd_dev_snaps_update(rbd_dev);
2718 if (rc)
2719 goto err_out_header;
2720
2721 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2722 if (rc)
2723 goto err_out_header;
2724
Alex Elder85ae8922012-07-26 23:37:14 -05002725 /* generate unique id: find highest unique id, add one */
2726 rbd_dev_id_get(rbd_dev);
2727
2728 /* Fill in the device name, now that we have its id. */
2729 BUILD_BUG_ON(DEV_NAME_LEN
2730 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2731 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2732
2733 /* Get our block major device number. */
2734
Alex Elder27cc2592012-02-02 08:13:30 -06002735 rc = register_blkdev(0, rbd_dev->name);
2736 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002737 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06002738 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002739
Alex Elder0f308a32012-08-29 17:11:07 -05002740 /* Set up the blkdev mapping. */
2741
2742 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002743 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002744 goto err_out_blkdev;
2745
Alex Elder0f308a32012-08-29 17:11:07 -05002746 rc = rbd_bus_add_dev(rbd_dev);
2747 if (rc)
2748 goto err_out_disk;
2749
Alex Elder32eec682012-02-08 16:11:14 -06002750 /*
2751 * At this point cleanup in the event of an error is the job
2752 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06002753 */
Alex Elder2ac4e752012-07-10 20:30:10 -05002754
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002755 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05002756 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002757 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002758 if (rc)
2759 goto err_out_bus;
2760
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002761 rc = rbd_init_watch_dev(rbd_dev);
2762 if (rc)
2763 goto err_out_bus;
2764
Alex Elder3ee40012012-08-29 17:11:07 -05002765 /* Everything's ready. Announce the disk to the world. */
2766
2767 add_disk(rbd_dev->disk);
2768
2769 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2770 (unsigned long long) rbd_dev->mapping.size);
2771
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002772 return count;
2773
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002774err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002775 /* this will also clean up rest of rbd_dev stuff */
2776
2777 rbd_bus_del_dev(rbd_dev);
2778 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002779 return rc;
2780
Alex Elder0f308a32012-08-29 17:11:07 -05002781err_out_disk:
2782 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002783err_out_blkdev:
2784 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05002785err_out_id:
2786 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05002787err_out_header:
2788 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002789err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05002790 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002791 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05002792 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05002793err_out_args:
2794 kfree(rbd_dev->mapping.snap_name);
2795 kfree(rbd_dev->image_name);
2796 kfree(rbd_dev->pool_name);
2797err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06002798 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002799 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002800
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002801 dout("Error adding device %s\n", buf);
2802 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002803
2804 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002805}
2806
Alex Elderde71a292012-07-03 16:01:19 -05002807static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002808{
2809 struct list_head *tmp;
2810 struct rbd_device *rbd_dev;
2811
Alex Eldere124a822012-01-29 13:57:44 -06002812 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002813 list_for_each(tmp, &rbd_dev_list) {
2814 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002815 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002816 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002817 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002818 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002819 }
Alex Eldere124a822012-01-29 13:57:44 -06002820 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002821 return NULL;
2822}
2823
/*
 * Device release callback, installed on the embedded struct device
 * by rbd_bus_add_dev().  Invoked by the driver core when the last
 * reference to the device is dropped (see rbd_bus_del_dev()); undoes
 * everything rbd_add() set up after device registration succeeded.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop watching the header object before tearing anything down */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2858
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002859static ssize_t rbd_remove(struct bus_type *bus,
2860 const char *buf,
2861 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002862{
2863 struct rbd_device *rbd_dev = NULL;
2864 int target_id, rc;
2865 unsigned long ul;
2866 int ret = count;
2867
2868 rc = strict_strtoul(buf, 10, &ul);
2869 if (rc)
2870 return rc;
2871
2872 /* convert to int; abort if we lost anything in the conversion */
2873 target_id = (int) ul;
2874 if (target_id != ul)
2875 return -EINVAL;
2876
2877 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2878
2879 rbd_dev = __rbd_get_dev(target_id);
2880 if (!rbd_dev) {
2881 ret = -ENOENT;
2882 goto done;
2883 }
2884
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002885 __rbd_remove_all_snaps(rbd_dev);
2886 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002887
2888done:
2889 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05002890
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002891 return ret;
2892}
2893
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002894static ssize_t rbd_snap_add(struct device *dev,
2895 struct device_attribute *attr,
2896 const char *buf,
2897 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002898{
Alex Elder593a9e72012-02-07 12:03:37 -06002899 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002900 int ret;
2901 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002902 if (!name)
2903 return -ENOMEM;
2904
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002905 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002906
2907 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2908
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002909 ret = rbd_header_add_snap(rbd_dev,
2910 name, GFP_KERNEL);
2911 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002912 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002913
Alex Elderb8136232012-07-25 09:32:41 -05002914 ret = __rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002915 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002916 goto err_unlock;
2917
2918 /* shouldn't hold ctl_mutex when notifying.. notify might
2919 trigger a watch callback that would need to get that mutex */
2920 mutex_unlock(&ctl_mutex);
2921
2922 /* make a best effort, don't error if failed */
Alex Elder4cb16252012-07-25 09:32:40 -05002923 rbd_req_sync_notify(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002924
2925 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002926 kfree(name);
2927 return ret;
2928
2929err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002930 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002931 kfree(name);
2932 return ret;
2933}
2934
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002935/*
2936 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002937 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002938 */
2939static int rbd_sysfs_init(void)
2940{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002941 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002942
Alex Elderfed4c142012-02-07 12:03:36 -06002943 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002944 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002945 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002946
Alex Elderfed4c142012-02-07 12:03:36 -06002947 ret = bus_register(&rbd_bus_type);
2948 if (ret < 0)
2949 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002950
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002951 return ret;
2952}
2953
/* Tear down the sysfs entries in reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2959
2960int __init rbd_init(void)
2961{
2962 int rc;
2963
2964 rc = rbd_sysfs_init();
2965 if (rc)
2966 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002967 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002968 return 0;
2969}
2970
/* Module exit point: remove the sysfs entries created by rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2975
2976module_init(rbd_init);
2977module_exit(rbd_exit);
2978
2979MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2980MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2981MODULE_DESCRIPTION("rados block device");
2982
2983/* following authorship retained from original osdblk.c */
2984MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2985
2986MODULE_LICENSE("GPL");