Commit 275220f0 authored by Linus Torvalds

Merge branch 'for-2.6.38/core' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.38/core' of git://git.kernel.dk/linux-2.6-block: (43 commits)
  block: ensure that completion error gets properly traced
  blktrace: add missing probe argument to block_bio_complete
  block cfq: don't use atomic_t for cfq_group
  block cfq: don't use atomic_t for cfq_queue
  block: trace event block fix unassigned field
  block: add internal hd part table references
  block: fix accounting bug on cross partition merges
  kref: add kref_test_and_get
  bio-integrity: mark kintegrityd_wq highpri and CPU intensive
  block: make kblockd_workqueue smarter
  Revert "sd: implement sd_check_events()"
  block: Clean up exit_io_context() source code.
  Fix compile warnings due to missing removal of a 'ret' variable
  fs/block: type signature of major_to_index(int) to major_to_index(unsigned)
  block: convert !IS_ERR(p) && p to !IS_ERR_OR_NULL(p)
  cfq-iosched: don't check cfqg in choose_service_tree()
  fs/splice: Pull buf->ops->confirm() from splice_from_pipe actors
  cdrom: export cdrom_check_events()
  sd: implement sd_check_events()
  sr: implement sr_check_events()
  ...
parents fe3c560b 81c5e2ae
......@@ -89,6 +89,33 @@ Throttling/Upper Limit policy
Limits for writes can be put using blkio.write_bps_device file.
Hierarchical Cgroups
====================
- Currently none of the IO control policies support hierarchical groups. But
the cgroup interface does allow creation of hierarchical cgroups, and
internally the IO policies treat them as a flat hierarchy.
So this patch will allow creation of a cgroup hierarchy, but at the backend
everything will be treated as flat. So if somebody creates a hierarchy
as follows,
root
/ \
test1 test2
|
test3
CFQ and throttling will practically treat all groups at the same level:
pivot
/ | \ \
root test1 test2 test3
Down the line we can implement hierarchical accounting/control support
and also introduce a new cgroup file "use_hierarchy" which will control
whether the cgroup hierarchy is viewed as flat or hierarchical by the policy.
This is also how the memory controller has implemented things.
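For illustration, a minimal user-space sketch of the hierarchy described above, assuming the blkio controller is mounted at /cgroup (the mount point and the use of blkio.weight are assumptions for the example, not part of this patch):
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
/* Write a value into a cgroup control file such as blkio.weight. */
static int cg_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ssize_t n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}
int main(void)
{
	/* The cgroup interface lets us nest test3 under test1... */
	mkdir("/cgroup/test1", 0755);
	mkdir("/cgroup/test1/test3", 0755);
	/* ...but at the backend CFQ/throttling treat test1 and test3
	 * as siblings, so these weights compete at one flat level. */
	cg_write("/cgroup/test1/blkio.weight", "500");
	cg_write("/cgroup/test1/test3/blkio.weight", "500");
	return 0;
}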
Various user visible config options
===================================
CONFIG_BLK_CGROUP
......
......@@ -1452,10 +1452,6 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
goto done;
}
/* Currently we do not support hierarchy deeper than two level (0,1) */
if (parent != cgroup->top_cgroup)
return ERR_PTR(-EPERM);
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
if (!blkcg)
return ERR_PTR(-ENOMEM);
......
......@@ -33,7 +33,7 @@
#include "blk.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
......@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
return;
cpu = part_stat_lock();
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!new_io)
if (!new_io) {
part = rq->part;
part_stat_inc(cpu, part, merges[rw]);
else {
} else {
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
/*
* The partition is already being removed,
* the request will be accounted on the disk only
*
* We take a reference on disk->part0 although that
* partition will never be deleted, so we can treat
* it as any other partition.
*/
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
part_round_stats(cpu, part);
part_inc_in_flight(part, rw);
rq->part = part;
}
part_stat_unlock();
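The hunk above hinges on hd_struct_try_get() failing once a partition is mid-removal, with disk->part0 as the always-safe fallback. A hedged, self-contained C11 sketch of that try-get-or-fall-back refcount pattern (the names are illustrative, not the kernel's):
#include <stdatomic.h>
struct part {
	atomic_int ref;	/* 0 means the object is being torn down */
};
/* Illustrative analogue of hd_struct_try_get(): take a reference
 * only if the count has not already dropped to zero. */
static int part_try_get(struct part *p)
{
	int old = atomic_load(&p->ref);
	while (old != 0) {
		if (atomic_compare_exchange_weak(&p->ref, &old, old + 1))
			return 1;	/* got a reference */
	}
	return 0;			/* already on its way out */
}
/* Analogue of the fallback above: part0 is never deleted, so an
 * unconditional get on it is always safe. */
static struct part *acct_part(struct part *target, struct part *part0)
{
	if (part_try_get(target))
		return target;
	atomic_fetch_add(&part0->ref, 1);
	return part0;
}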
......@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->ref_count = 1;
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
}
EXPORT_SYMBOL(blk_rq_init);
......@@ -1329,9 +1344,9 @@ static inline void blk_partition_remap(struct bio *bio)
bio->bi_sector += p->start_sect;
bio->bi_bdev = bdev->bd_contains;
trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
bdev->bd_dev,
bio->bi_sector - p->start_sect);
trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
bdev->bd_dev,
bio->bi_sector - p->start_sect);
}
}
......@@ -1500,7 +1515,7 @@ static inline void __generic_make_request(struct bio *bio)
goto end_io;
if (old_sector != -1)
trace_block_remap(q, bio, old_dev, old_sector);
trace_block_bio_remap(q, bio, old_dev, old_sector);
old_sector = bio->bi_sector;
old_dev = bio->bi_bdev->bd_dev;
......@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
int cpu;
cpu = part_stat_lock();
part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
part = req->part;
part_stat_add(cpu, part, sectors[rw], bytes >> 9);
part_stat_unlock();
}
......@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct request *req)
int cpu;
cpu = part_stat_lock();
part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
part = req->part;
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, ticks[rw], duration);
part_round_stats(cpu, part);
part_dec_in_flight(part, rw);
hd_struct_put(part);
part_stat_unlock();
}
}
......@@ -2606,7 +2622,9 @@ int __init blk_dev_init(void)
BUILD_BUG_ON(__REQ_NR_BITS > 8 *
sizeof(((struct request *)0)->cmd_flags));
kblockd_workqueue = create_workqueue("kblockd");
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
......
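For context, alloc_workqueue() superseded create_workqueue() in 2.6.36 and is what lets kblockd ask for WQ_HIGHPRI and WQ_MEM_RECLAIM here. A minimal module sketch using the same flags (the module and work item names are made up for illustration):
#include <linux/module.h>
#include <linux/workqueue.h>
static struct workqueue_struct *example_wq;
static void example_fn(struct work_struct *work) { }
static DECLARE_WORK(example_work, example_fn);
static int __init example_init(void)
{
	/* WQ_MEM_RECLAIM guarantees forward progress under memory
	 * pressure; WQ_HIGHPRI keeps queued work low latency. */
	example_wq = alloc_workqueue("example",
				     WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!example_wq)
		return -ENOMEM;
	queue_work(example_wq, &example_work);
	return 0;
}
static void __exit example_exit(void)
{
	destroy_workqueue(example_wq);
}
module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");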
......@@ -64,7 +64,7 @@ static void cfq_exit(struct io_context *ioc)
rcu_read_unlock();
}
/* Called by the exitting task */
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
......@@ -74,10 +74,9 @@ void exit_io_context(struct task_struct *task)
task->io_context = NULL;
task_unlock(task);
if (atomic_dec_and_test(&ioc->nr_tasks)) {
if (atomic_dec_and_test(&ioc->nr_tasks))
cfq_exit(ioc);
}
put_io_context(ioc);
}
......
......@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct request *req)
int cpu;
cpu = part_stat_lock();
part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
part = req->part;
part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
hd_struct_put(part);
part_stat_unlock();
}
}
......
......@@ -87,7 +87,6 @@ struct cfq_rb_root {
unsigned count;
unsigned total_weight;
u64 min_vdisktime;
struct rb_node *active;
};
#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
.count = 0, .min_vdisktime = 0, }
......@@ -97,7 +96,7 @@ struct cfq_rb_root {
*/
struct cfq_queue {
/* reference count */
atomic_t ref;
int ref;
/* various state flags, see below */
unsigned int flags;
/* parent cfq_data */
......@@ -180,7 +179,6 @@ struct cfq_group {
/* group service_tree key */
u64 vdisktime;
unsigned int weight;
bool on_st;
/* number of cfqq currently on this group */
int nr_cfqq;
......@@ -209,7 +207,7 @@ struct cfq_group {
struct blkio_group blkg;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
struct hlist_node cfqd_node;
atomic_t ref;
int ref;
#endif
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
......@@ -563,11 +561,6 @@ static void update_min_vdisktime(struct cfq_rb_root *st)
u64 vdisktime = st->min_vdisktime;
struct cfq_group *cfqg;
if (st->active) {
cfqg = rb_entry_cfqg(st->active);
vdisktime = cfqg->vdisktime;
}
if (st->left) {
cfqg = rb_entry_cfqg(st->left);
vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
......@@ -646,11 +639,11 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
{
if (cfq_cfqq_slice_new(cfqq))
return 0;
return false;
if (time_before(jiffies, cfqq->slice_end))
return 0;
return false;
return 1;
return true;
}
/*
......@@ -869,7 +862,7 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
struct rb_node *n;
cfqg->nr_cfqq++;
if (cfqg->on_st)
if (!RB_EMPTY_NODE(&cfqg->rb_node))
return;
/*
......@@ -885,7 +878,6 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
cfqg->vdisktime = st->min_vdisktime;
__cfq_group_service_tree_add(st, cfqg);
cfqg->on_st = true;
st->total_weight += cfqg->weight;
}
......@@ -894,9 +886,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
if (st->active == &cfqg->rb_node)
st->active = NULL;
BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;
......@@ -905,7 +894,6 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
return;
cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
cfqg->on_st = false;
st->total_weight -= cfqg->weight;
if (!RB_EMPTY_NODE(&cfqg->rb_node))
cfq_rb_erase(&cfqg->rb_node, st);
......@@ -1026,7 +1014,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
* elevator which will be dropped by either elevator exit
* or cgroup deletion path depending on who is exiting first.
*/
atomic_set(&cfqg->ref, 1);
cfqg->ref = 1;
/*
* Add group onto cgroup list. It might happen that bdi->dev is
......@@ -1071,7 +1059,7 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
atomic_inc(&cfqg->ref);
cfqg->ref++;
return cfqg;
}
......@@ -1083,7 +1071,7 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
cfqq->cfqg = cfqg;
/* cfqq reference on cfqg */
atomic_inc(&cfqq->cfqg->ref);
cfqq->cfqg->ref++;
}
static void cfq_put_cfqg(struct cfq_group *cfqg)
......@@ -1091,11 +1079,12 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
struct cfq_rb_root *st;
int i, j;
BUG_ON(atomic_read(&cfqg->ref) <= 0);
if (!atomic_dec_and_test(&cfqg->ref))
BUG_ON(cfqg->ref <= 0);
cfqg->ref--;
if (cfqg->ref)
return;
for_each_cfqg_st(cfqg, i, j, st)
BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
BUG_ON(!RB_EMPTY_ROOT(&st->rb));
kfree(cfqg);
}
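The atomic_t-to-int conversions in this file work because every cfqq/cfqg reference manipulation already runs under the queue lock, so the atomic ops were paying for serialization the lock provides anyway. A hedged user-space analogy of the same idea (a plain counter suffices when one lock serializes all of its users):
#include <pthread.h>
#include <stdlib.h>
struct obj {
	int ref;		/* plain int: only touched under lock */
	pthread_mutex_t *lock;	/* analogue of the queue_lock */
};
static void obj_get(struct obj *o)
{
	pthread_mutex_lock(o->lock);
	o->ref++;		/* no atomic needed, the lock serializes */
	pthread_mutex_unlock(o->lock);
}
static void obj_put(struct obj *o)
{
	pthread_mutex_lock(o->lock);
	int last = (--o->ref == 0);
	pthread_mutex_unlock(o->lock);
	if (last)
		free(o);	/* the lock lives elsewhere */
}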
......@@ -1200,7 +1189,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfq_group_service_tree_del(cfqd, cfqq->cfqg);
cfqq->orig_cfqg = cfqq->cfqg;
cfqq->cfqg = &cfqd->root_group;
atomic_inc(&cfqd->root_group.ref);
cfqd->root_group.ref++;
group_changed = 1;
} else if (!cfqd->cfq_group_isolation
&& cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
......@@ -1687,9 +1676,6 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfqq == cfqd->active_queue)
cfqd->active_queue = NULL;
if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
cfqd->grp_service_tree.active = NULL;
if (cfqd->active_cic) {
put_io_context(cfqd->active_cic->ioc);
cfqd->active_cic = NULL;
......@@ -1901,10 +1887,10 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
* in their service tree.
*/
if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
return 1;
return true;
cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
service_tree->count);
return 0;
return false;
}
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
......@@ -2040,7 +2026,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq)
int process_refs, io_refs;
io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
process_refs = atomic_read(&cfqq->ref) - io_refs;
process_refs = cfqq->ref - io_refs;
BUG_ON(process_refs < 0);
return process_refs;
}
......@@ -2080,10 +2066,10 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
*/
if (new_process_refs >= process_refs) {
cfqq->new_cfqq = new_cfqq;
atomic_add(process_refs, &new_cfqq->ref);
new_cfqq->ref += process_refs;
} else {
new_cfqq->new_cfqq = cfqq;
atomic_add(new_process_refs, &cfqq->ref);
cfqq->ref += new_process_refs;
}
}
......@@ -2116,12 +2102,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
unsigned count;
struct cfq_rb_root *st;
unsigned group_slice;
if (!cfqg) {
cfqd->serving_prio = IDLE_WORKLOAD;
cfqd->workload_expires = jiffies + 1;
return;
}
enum wl_prio_t original_prio = cfqd->serving_prio;
/* Choose next priority. RT > BE > IDLE */
if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
......@@ -2134,6 +2115,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
return;
}
if (original_prio != cfqd->serving_prio)
goto new_workload;
/*
* For RT and BE, we have to choose also the type
* (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
......@@ -2148,6 +2132,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
if (count && !time_after(jiffies, cfqd->workload_expires))
return;
new_workload:
/* otherwise select new workload type */
cfqd->serving_type =
cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
......@@ -2199,7 +2184,6 @@ static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
if (RB_EMPTY_ROOT(&st->rb))
return NULL;
cfqg = cfq_rb_first_group(st);
st->active = &cfqg->rb_node;
update_min_vdisktime(st);
return cfqg;
}
......@@ -2293,6 +2277,17 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
goto keep_queue;
}
/*
* This is a deep seek queue, but the device is much faster than
* the queue can deliver, don't idle
**/
if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
(cfq_cfqq_slice_new(cfqq) ||
(cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
cfq_clear_cfqq_deep(cfqq);
cfq_clear_cfqq_idle_window(cfqq);
}
if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
cfqq = NULL;
goto keep_queue;
......@@ -2367,12 +2362,12 @@ static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
{
/* the queue hasn't finished any request, can't estimate */
if (cfq_cfqq_slice_new(cfqq))
return 1;
return true;
if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
cfqq->slice_end))
return 1;
return true;
return 0;
return false;
}
static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
......@@ -2538,9 +2533,10 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
struct cfq_data *cfqd = cfqq->cfqd;
struct cfq_group *cfqg, *orig_cfqg;
BUG_ON(atomic_read(&cfqq->ref) <= 0);
BUG_ON(cfqq->ref <= 0);
if (!atomic_dec_and_test(&cfqq->ref))
cfqq->ref--;
if (cfqq->ref)
return;
cfq_log_cfqq(cfqd, cfqq, "put_queue");
......@@ -2843,7 +2839,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
RB_CLEAR_NODE(&cfqq->p_node);
INIT_LIST_HEAD(&cfqq->fifo);
atomic_set(&cfqq->ref, 0);
cfqq->ref = 0;
cfqq->cfqd = cfqd;
cfq_mark_cfqq_prio_changed(cfqq);
......@@ -2979,11 +2975,11 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
* pin the queue now that it's allocated, scheduler exit will prune it
*/
if (!is_sync && !(*async_cfqq)) {
atomic_inc(&cfqq->ref);
cfqq->ref++;
*async_cfqq = cfqq;
}
atomic_inc(&cfqq->ref);
cfqq->ref++;
return cfqq;
}
......@@ -3265,6 +3261,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
return true;
/* An idle queue should not be idle now for some reason */
if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
return true;
if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
return false;
......@@ -3681,13 +3681,13 @@ new_queue:
}
cfqq->allocated[rw]++;
atomic_inc(&cfqq->ref);
spin_unlock_irqrestore(q->queue_lock, flags);
cfqq->ref++;
rq->elevator_private = cic;
rq->elevator_private2 = cfqq;
rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
spin_unlock_irqrestore(q->queue_lock, flags);
return 0;
queue_fail:
......@@ -3862,6 +3862,10 @@ static void *cfq_init_queue(struct request_queue *q)
if (!cfqd)
return NULL;
/*
* Don't need take queue_lock in the routine, since we are
* initializing the ioscheduler, and nobody is using cfqd
*/
cfqd->cic_index = i;
/* Init root service tree */
......@@ -3881,7 +3885,7 @@ static void *cfq_init_queue(struct request_queue *q)
* Take a reference to root group which we never drop. This is just
* to make sure that cfq_put_cfqg() does not try to kfree root group
*/
atomic_set(&cfqg->ref, 1);
cfqg->ref = 1;
rcu_read_lock();
cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
(void *)cfqd, 0);
......@@ -3901,7 +3905,7 @@ static void *cfq_init_queue(struct request_queue *q)
* will not attempt to free it.
*/
cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
atomic_inc(&cfqd->oom_cfqq.ref);
cfqd->oom_cfqq.ref++;
cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
INIT_LIST_HEAD(&cfqd->cic_list);
......
......@@ -18,6 +18,7 @@
#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
#include "blk.h"
......@@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr);
static struct device_type disk_type;
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
/**
* disk_get_part - get partition
* @disk: disk to look partition from
......@@ -239,7 +244,7 @@ static struct blk_major_name {
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
/* index in the above - for now: assume no multimajor ranges */
static inline int major_to_index(int major)
static inline int major_to_index(unsigned major)
{
return major % BLKDEV_MAJOR_HASH_SIZE;
}
......@@ -502,6 +507,64 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
void register_disk(struct gendisk *disk)
{
struct device *ddev = disk_to_dev(disk);
struct block_device *bdev;
struct disk_part_iter piter;
struct hd_struct *part;
int err;
ddev->parent = disk->driverfs_dev;
dev_set_name(ddev, disk->disk_name);
/* delay uevents, until we scanned partition table */
dev_set_uevent_suppress(ddev, 1);
if (device_add(ddev))
return;
if (!sysfs_deprecated) {
err = sysfs_create_link(block_depr, &ddev->kobj,
kobject_name(&ddev->kobj));
if (err) {
device_del(ddev);
return;
}
}
disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
/* No minors to use for partitions */
if (!disk_partitionable(disk))
goto exit;
/* No such device (e.g., media were just removed) */
if (!get_capacity(disk))
goto exit;
bdev = bdget_disk(disk, 0);
if (!bdev)
goto exit;
bdev->bd_invalidated = 1;
err = blkdev_get(bdev, FMODE_READ, NULL);
if (err < 0)
goto exit;
blkdev_put(bdev, FMODE_READ);
exit:
/* announce disk after possible partitions are created */
dev_set_uevent_suppress(ddev, 0);
kobject_uevent(&ddev->kobj, KOBJ_ADD);
/* announce possible partitions */
disk_part_iter_init(&piter, disk, 0);
while ((part = disk_part_iter_next(&piter)))
kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
disk_part_iter_exit(&piter);
}
/**
* add_disk - add partitioning information to kernel list
* @disk: per-device partitioning information
......@@ -551,18 +614,48 @@ void add_disk(struct gendisk *disk)
retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
"bdi");
WARN_ON(retval);
}
disk_add_events(disk);
}
EXPORT_SYMBOL(add_disk);
EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
void unlink_gendisk(struct gendisk *disk)
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
struct hd_struct *part;
disk_del_events(disk);
/* invalidate stuff */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
while ((part = disk_part_iter_next(&piter))) {
invalidate_partition(disk, part->partno);
delete_partition(disk, part->partno);
}
disk_part_iter_exit(&piter);
invalidate_partition(disk, 0);
blk_free_devt(disk_to_dev(disk)->devt);
set_capacity(disk, 0);
disk->flags &= ~GENHD_FL_UP;
sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
bdi_unregister(&disk->queue->backing_dev_info);
blk_unregister_queue(disk);
blk_unregister_region(disk_devt(disk), disk->minors);
part_stat_set_all(&disk->part0, 0);
disk->part0.stamp = 0;
kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
disk->driverfs_dev = NULL;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
device_del(disk_to_dev(disk));
}
EXPORT_SYMBOL(del_gendisk);
/**
* get_gendisk - get partitioning information for a given device
......@@ -735,7 +828,7 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
static void *p;
p = disk_seqf_start(seqf, pos);
if (!IS_ERR(p) && p && !*pos)
if (!IS_ERR_OR_NULL(p) && !*pos)
seq_puts(seqf, "major minor #blocks name\n\n");
return p;
}
......@@ -1005,6 +1098,7 @@ static void disk_release(struct device *dev)
{
struct gendisk *disk = dev_to_disk(dev);
disk_release_events(disk);
kfree(disk->random);
disk_replace_part_tbl(disk, NULL);
free_part_stats(&disk->part0);
......@@ -1110,29 +1204,6 @@ static int __init proc_genhd_init(void)
module_init(proc_genhd_init);
#endif /* CONFIG_PROC_FS */
static void media_change_notify_thread(struct work_struct *work)
{
struct gendisk *gd = container_of(work, struct gendisk, async_notify);
char event[] = "MEDIA_CHANGE=1";
char *envp[] = { event, NULL };
/*
* set enviroment vars to indicate which event this is for
* so that user space will know to go check the media status.
*/
kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
put_device(gd->driverfs_dev);
}
#if 0
void genhd_media_change_notify(struct gendisk *disk)
{
get_device(disk->driverfs_dev);
schedule_work(&disk->async_notify);
}
EXPORT_SYMBOL_GPL(genhd_media_change_notify);
#endif /* 0 */
dev_t blk_lookup_devt(const char *name, int partno)
{
dev_t devt = MKDEV(0, 0);
......@@ -1193,13 +1264,13 @@ struct gendisk *alloc_disk_node(int minors, int node_id)
}
disk->part_tbl->part[0] = &disk->part0;
hd_ref_init(&disk->part0);
disk->minors = minors;
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
INIT_WORK(&disk->async_notify,
media_change_notify_thread);
}
return disk;
}
......@@ -1291,3 +1362,422 @@ int invalidate_partition(struct gendisk *disk, int partno)
}
EXPORT_SYMBOL(invalidate_partition);
/*
* Disk events - monitor disk events like media change and eject request.
*/
struct disk_events {
struct list_head node; /* all disk_event's */
struct gendisk *disk; /* the associated disk */
spinlock_t lock;
int block; /* event blocking depth */
unsigned int pending; /* events already sent out */
unsigned int clearing; /* events being cleared */
long poll_msecs; /* interval, -1 for default */
struct delayed_work dwork;
};
static const char *disk_events_strs[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
};
static char *disk_uevents[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
};
/* list of all disk_events */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);
/* disable in-kernel polling by default */
static unsigned long disk_events_dfl_poll_msecs = 0;
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
struct disk_events *ev = disk->ev;
long intv_msecs = 0;
/*
* If device-specific poll interval is set, always use it. If
* the default is being used, poll iff there are events which
* can't be monitored asynchronously.
*/
if (ev->poll_msecs >= 0)
intv_msecs = ev->poll_msecs;
else if (disk->events & ~disk->async_events)
intv_msecs = disk_events_dfl_poll_msecs;
return msecs_to_jiffies(intv_msecs);
}
static void __disk_block_events(struct gendisk *disk, bool sync)
{
struct disk_events *ev = disk->ev;
unsigned long flags;
bool cancel;
spin_lock_irqsave(&ev->lock, flags);
cancel = !ev->block++;
spin_unlock_irqrestore(&ev->lock, flags);
if (cancel) {
if (sync)
cancel_delayed_work_sync(&disk->ev->dwork);
else
cancel_delayed_work(&disk->ev->dwork);
}
}
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
struct disk_events *ev = disk->ev;
unsigned long intv;
unsigned long flags;
spin_lock_irqsave(&ev->lock, flags);
if (WARN_ON_ONCE(ev->block <= 0))
goto out_unlock;
if (--ev->block)
goto out_unlock;
/*
* Not exactly a latency critical operation, set poll timer
* slack to 25% and kick event check.
*/
intv = disk_events_poll_jiffies(disk);
set_timer_slack(&ev->dwork.timer, intv / 4);
if (check_now)
queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
else if (intv)
queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
out_unlock:
spin_unlock_irqrestore(&ev->lock, flags);
}
/**
* disk_block_events - block and flush disk event checking
* @disk: disk to block events for
*
* On return from this function, it is guaranteed that event checking
* isn't in progress and won't happen until unblocked by
* disk_unblock_events(). Events blocking is counted and the actual
* unblocking happens after the matching number of unblocks are done.
*
* Note that this intentionally does not block event checking from
* disk_clear_events().
*
* CONTEXT:
* Might sleep.
*/
void disk_block_events(struct gendisk *disk)
{
if (disk->ev)
__disk_block_events(disk, true);
}
/**
* disk_unblock_events - unblock disk event checking
* @disk: disk to unblock events for
*
* Undo disk_block_events(). When the block count reaches zero, it
* starts events polling if configured.
*
* CONTEXT:
* Don't care. Safe to call from irq context.
*/
void disk_unblock_events(struct gendisk *disk)
{
if (disk->ev)
__disk_unblock_events(disk, true);
}
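As the kernel-doc above describes, disk_block_events()/disk_unblock_events() nest by count, with polling resuming only at the outermost unblock. A sketch of the intended pairing (the function itself is illustrative, assuming a struct gendisk at hand):
#include <linux/genhd.h>
static void example_quiesce(struct gendisk *disk)
{
	disk_block_events(disk);	/* count 0 -> 1, work cancelled */
	disk_block_events(disk);	/* nested: 1 -> 2, no extra effect */
	/* ... touch the device with no event check racing us ... */
	disk_unblock_events(disk);	/* 2 -> 1, still blocked */
	disk_unblock_events(disk);	/* 1 -> 0, polling re-armed */
}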
/**
* disk_check_events - schedule immediate event checking
* @disk: disk to check events for
*
* Schedule immediate event checking on @disk if not blocked.
*
* CONTEXT:
* Don't care. Safe to call from irq context.
*/
void disk_check_events(struct gendisk *disk)
{
if (disk->ev) {
__disk_block_events(disk, false);
__disk_unblock_events(disk, true);
}
}
EXPORT_SYMBOL_GPL(disk_check_events);
/**
* disk_clear_events - synchronously check, clear and return pending events
* @disk: disk to fetch and clear events from
* @mask: mask of events to be fetched and cleared
*
* Disk events are synchronously checked and pending events in @mask
* are cleared and returned. This ignores the block count.
*
* CONTEXT:
* Might sleep.
*/
unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
const struct block_device_operations *bdops = disk->fops;
struct disk_events *ev = disk->ev;
unsigned int pending;
if (!ev) {
/* for drivers still using the old ->media_changed method */
if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
bdops->media_changed && bdops->media_changed(disk))
return DISK_EVENT_MEDIA_CHANGE;
return 0;
}
/* tell the workfn about the events being cleared */
spin_lock_irq(&ev->lock);
ev->clearing |= mask;
spin_unlock_irq(&ev->lock);
/* unconditionally schedule event check and wait for it to finish */
__disk_block_events(disk, true);
queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
flush_delayed_work(&ev->dwork);
__disk_unblock_events(disk, false);
/* then, fetch and clear pending events */
spin_lock_irq(&ev->lock);
WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */
pending = ev->pending & mask;
ev->pending &= ~mask;
spin_unlock_irq(&ev->lock);
return pending;
}
static void disk_events_workfn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
struct gendisk *disk = ev->disk;
char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
unsigned int clearing = ev->clearing;
unsigned int events;
unsigned long intv;
int nr_events = 0, i;
/* check events */
events = disk->fops->check_events(disk, clearing);
/* accumulate pending events and schedule next poll if necessary */
spin_lock_irq(&ev->lock);
events &= ~ev->pending;
ev->pending |= events;
ev->clearing &= ~clearing;
intv = disk_events_poll_jiffies(disk);
if (!ev->block && intv)
queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
spin_unlock_irq(&ev->lock);
/* tell userland about new events */
for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
if (events & (1 << i))
envp[nr_events++] = disk_uevents[i];
if (nr_events)
kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}
/*
* A disk events enabled device has the following sysfs nodes under
* its /sys/block/X/ directory.
*
* events : list of all supported events
* events_async : list of events which can be detected w/o polling
* events_poll_msecs : polling interval, 0: disable, -1: system default
*/
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
const char *delim = "";
ssize_t pos = 0;
int i;
for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
if (events & (1 << i)) {
pos += sprintf(buf + pos, "%s%s",
delim, disk_events_strs[i]);
delim = " ";
}
if (pos)
pos += sprintf(buf + pos, "\n");
return pos;
}
static ssize_t disk_events_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
return __disk_events_show(disk->events, buf);
}
static ssize_t disk_events_async_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
return __disk_events_show(disk->async_events, buf);
}
static ssize_t disk_events_poll_msecs_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
}
static ssize_t disk_events_poll_msecs_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
long intv;
if (!count || !sscanf(buf, "%ld", &intv))
return -EINVAL;
if (intv < 0 && intv != -1)
return -EINVAL;
__disk_block_events(disk, true);
disk->ev->poll_msecs = intv;
__disk_unblock_events(disk, true);
return count;
}
static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
static const struct attribute *disk_events_attrs[] = {
&dev_attr_events.attr,
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
NULL,
};
/*
* The default polling interval can be specified by the kernel
* parameter block.events_dfl_poll_msecs which defaults to 0
* (disable). This can also be modified runtime by writing to
* /sys/module/block/events_dfl_poll_msecs.
*/
static int disk_events_set_dfl_poll_msecs(const char *val,
const struct kernel_param *kp)
{
struct disk_events *ev;
int ret;
ret = param_set_ulong(val, kp);
if (ret < 0)
return ret;
mutex_lock(&disk_events_mutex);
list_for_each_entry(ev, &disk_events, node)
disk_check_events(ev->disk);
mutex_unlock(&disk_events_mutex);
return 0;
}
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
.set = disk_events_set_dfl_poll_msecs,
.get = param_get_ulong,
};
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "block."
module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
&disk_events_dfl_poll_msecs, 0644);
/*
* disk_{add|del|release}_events - initialize and destroy disk_events.
*/
static void disk_add_events(struct gendisk *disk)
{
struct disk_events *ev;
if (!disk->fops->check_events || !(disk->events | disk->async_events))
return;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev) {
pr_warn("%s: failed to initialize events\n", disk->disk_name);
return;
}
if (sysfs_create_files(&disk_to_dev(disk)->kobj,
disk_events_attrs) < 0) {
pr_warn("%s: failed to create sysfs files for events\n",
disk->disk_name);
kfree(ev);
return;
}
disk->ev = ev;
INIT_LIST_HEAD(&ev->node);
ev->disk = disk;
spin_lock_init(&ev->lock);
ev->block = 1;
ev->poll_msecs = -1;
INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
mutex_lock(&disk_events_mutex);
list_add_tail(&ev->node, &disk_events);
mutex_unlock(&disk_events_mutex);
/*
* Block count is initialized to 1 and the following initial
* unblock kicks it into action.
*/
__disk_unblock_events(disk, true);
}
static void disk_del_events(struct gendisk *disk)
{
if (!disk->ev)
return;
__disk_block_events(disk, true);
mutex_lock(&disk_events_mutex);
list_del_init(&disk->ev->node);
mutex_unlock(&disk_events_mutex);
sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
}
static void disk_release_events(struct gendisk *disk)
{
/* the block count should be 1 from disk_del_events() */
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
kfree(disk->ev);
}
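To poke at the new sysfs nodes from user space, a small sketch (the sr0 path is an assumption; any disk with events support works):
#include <stdio.h>
int main(void)
{
	char buf[128];
	FILE *f;
	/* List the events this device supports. */
	f = fopen("/sys/block/sr0/events", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("events: %s", buf);
		fclose(f);
	}
	/* Poll this device every 2 seconds (-1 would fall back to
	 * the block.events_dfl_poll_msecs default). */
	f = fopen("/sys/block/sr0/events_poll_msecs", "w");
	if (f) {
		fputs("2000", f);
		fclose(f);
	}
	return 0;
}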
......@@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -EINVAL;
if (get_user(n, (int __user *) arg))
return -EFAULT;
if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0)
if (!(mode & FMODE_EXCL) &&
blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
return -EBUSY;
ret = set_blocksize(bdev, n);
if (!(mode & FMODE_EXCL))
bd_release(bdev);
blkdev_put(bdev, mode | FMODE_EXCL);
return ret;
case BLKPG:
ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
......
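The bd_claim()/bd_release() pairs removed throughout the rest of this merge become blkdev_get()/blkdev_put() calls carrying FMODE_EXCL, with the holder argument taking over bd_claim()'s owner role. A condensed kernel-style sketch of the new convention (the example_* wrappers are illustrative):
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/fs.h>
static struct block_device *example_open(const char *path, void *holder)
{
	/* Open and claim in one step, as the drbd/md/dm hunks below do. */
	return blkdev_get_by_path(path,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
}
static void example_close(struct block_device *bdev)
{
	/* FMODE_EXCL must be passed again so the claim is released. */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}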
......@@ -911,8 +911,6 @@ struct drbd_md {
struct drbd_backing_dev {
struct block_device *backing_bdev;
struct block_device *md_bdev;
struct file *lo_file;
struct file *md_file;
struct drbd_md md;
struct disk_conf dc; /* The user provided config... */
sector_t known_size; /* last known size of that backing device */
......
......@@ -3372,11 +3372,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
if (ldev == NULL)
return;
bd_release(ldev->backing_bdev);
bd_release(ldev->md_bdev);
fput(ldev->lo_file);
fput(ldev->md_file);
blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(ldev);
}
......
......@@ -855,7 +855,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
sector_t max_possible_sectors;
sector_t min_md_device_sectors;
struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
struct inode *inode, *inode2;
struct block_device *bdev;
struct lru_cache *resync_lru = NULL;
union drbd_state ns, os;
unsigned int max_seg_s;
......@@ -907,46 +907,40 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
}
}
nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
if (IS_ERR(nbc->lo_file)) {
bdev = blkdev_get_by_path(nbc->dc.backing_dev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
if (IS_ERR(bdev)) {
dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
PTR_ERR(nbc->lo_file));
nbc->lo_file = NULL;
PTR_ERR(bdev));
retcode = ERR_OPEN_DISK;
goto fail;
}
nbc->backing_bdev = bdev;
inode = nbc->lo_file->f_dentry->d_inode;
if (!S_ISBLK(inode->i_mode)) {
retcode = ERR_DISK_NOT_BDEV;
goto fail;
}
nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
if (IS_ERR(nbc->md_file)) {
/*
* meta_dev_idx >= 0: external fixed size, possibly multiple
* drbd sharing one meta device. TODO in that case, paranoia
* check that [md_bdev, meta_dev_idx] is not yet used by some
* other drbd minor! (if you use drbd.conf + drbdadm, that
* should check it for you already; but if you don't, or
* someone fooled it, we need to double check here)
*/
bdev = blkdev_get_by_path(nbc->dc.meta_dev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL,
(nbc->dc.meta_dev_idx < 0) ?
(void *)mdev : (void *)drbd_m_holder);
if (IS_ERR(bdev)) {
dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
PTR_ERR(nbc->md_file));
nbc->md_file = NULL;
PTR_ERR(bdev));
retcode = ERR_OPEN_MD_DISK;
goto fail;
}
nbc->md_bdev = bdev;
inode2 = nbc->md_file->f_dentry->d_inode;
if (!S_ISBLK(inode2->i_mode)) {
retcode = ERR_MD_NOT_BDEV;
goto fail;
}
nbc->backing_bdev = inode->i_bdev;
if (bd_claim(nbc->backing_bdev, mdev)) {
printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
nbc->backing_bdev, mdev,
nbc->backing_bdev->bd_holder,
nbc->backing_bdev->bd_contains->bd_holder,
nbc->backing_bdev->bd_holders);
retcode = ERR_BDCLAIM_DISK;
if ((nbc->backing_bdev == nbc->md_bdev) !=
(nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
retcode = ERR_MD_IDX_INVALID;
goto fail;
}
......@@ -955,28 +949,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
offsetof(struct bm_extent, lce));
if (!resync_lru) {
retcode = ERR_NOMEM;
goto release_bdev_fail;
}
/* meta_dev_idx >= 0: external fixed size,
* possibly multiple drbd sharing one meta device.
* TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
* not yet used by some other drbd minor!
* (if you use drbd.conf + drbdadm,
* that should check it for you already; but if you don't, or someone
* fooled it, we need to double check here) */
nbc->md_bdev = inode2->i_bdev;
if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
: (void *) drbd_m_holder)) {
retcode = ERR_BDCLAIM_MD_DISK;
goto release_bdev_fail;
}
if ((nbc->backing_bdev == nbc->md_bdev) !=
(nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
retcode = ERR_MD_IDX_INVALID;
goto release_bdev2_fail;
goto fail;
}
/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
......@@ -987,7 +960,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
(unsigned long long) drbd_get_max_capacity(nbc),
(unsigned long long) nbc->dc.disk_size);
retcode = ERR_DISK_TO_SMALL;
goto release_bdev2_fail;
goto fail;
}
if (nbc->dc.meta_dev_idx < 0) {
......@@ -1004,7 +977,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
dev_warn(DEV, "refusing attach: md-device too small, "
"at least %llu sectors needed for this meta-disk type\n",
(unsigned long long) min_md_device_sectors);
goto release_bdev2_fail;
goto fail;
}
/* Make sure the new disk is big enough
......@@ -1012,7 +985,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
if (drbd_get_max_capacity(nbc) <
drbd_get_capacity(mdev->this_bdev)) {
retcode = ERR_DISK_TO_SMALL;
goto release_bdev2_fail;
goto fail;
}
nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
......@@ -1035,7 +1008,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
drbd_resume_io(mdev);
if (retcode < SS_SUCCESS)
goto release_bdev2_fail;
goto fail;
if (!get_ldev_if_state(mdev, D_ATTACHING))
goto force_diskless;
......@@ -1269,18 +1242,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
force_diskless:
drbd_force_state(mdev, NS(disk, D_FAILED));
drbd_md_sync(mdev);
release_bdev2_fail:
if (nbc)
bd_release(nbc->md_bdev);
release_bdev_fail:
if (nbc)
bd_release(nbc->backing_bdev);
fail:
if (nbc) {
if (nbc->lo_file)
fput(nbc->lo_file);
if (nbc->md_file)
fput(nbc->md_file);
if (nbc->backing_bdev)
blkdev_put(nbc->backing_bdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL);
if (nbc->md_bdev)
blkdev_put(nbc->md_bdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(nbc);
}
lc_destroy(resync_lru);
......
......@@ -395,11 +395,7 @@ lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct loop_device *lo = p->lo;
struct page *page = buf->page;
sector_t IV;
int size, ret;
ret = buf->ops->confirm(pipe, buf);
if (unlikely(ret))
return ret;
int size;
IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
(buf->offset >> 9);
......
......@@ -2296,15 +2296,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
* so bdget() can't fail.
*/
bdget(pd->bdev->bd_dev);
if ((ret = blkdev_get(pd->bdev, FMODE_READ)))
if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
goto out;
if ((ret = bd_claim(pd->bdev, pd)))
goto out_putdev;
if ((ret = pkt_get_last_written(pd, &lba))) {
printk(DRIVER_NAME": pkt_get_last_written failed\n");
goto out_unclaim;
goto out_putdev;
}
set_capacity(pd->disk, lba << 2);
......@@ -2314,7 +2311,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
q = bdev_get_queue(pd->bdev);
if (write) {
if ((ret = pkt_open_write(pd)))
goto out_unclaim;
goto out_putdev;
/*
* Some CDRW drives can not handle writes larger than one packet,
* even if the size is a multiple of the packet size.
......@@ -2329,23 +2326,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
}
if ((ret = pkt_set_segment_merging(pd, q)))
goto out_unclaim;
goto out_putdev;
if (write) {
if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
printk(DRIVER_NAME": not enough memory for buffers\n");
ret = -ENOMEM;
goto out_unclaim;
goto out_putdev;
}
printk(DRIVER_NAME": %lukB available on disc\n", lba << 1);
}
return 0;
out_unclaim:
bd_release(pd->bdev);
out_putdev:
blkdev_put(pd->bdev, FMODE_READ);
blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
out:
return ret;
}
......@@ -2362,8 +2357,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
pkt_lock_door(pd, 0);
pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
bd_release(pd->bdev);
blkdev_put(pd->bdev, FMODE_READ);
blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
pkt_shrink_pktlist(pd);
}
......@@ -2733,7 +2727,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
bdev = bdget(dev);
if (!bdev)
return -ENOMEM;
ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY);
ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
if (ret)
return ret;
......
......@@ -1348,7 +1348,10 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
if (!CDROM_CAN(CDC_SELECT_DISC))
return -EDRIVE_CANT_DO_THIS;
(void) cdi->ops->media_changed(cdi, slot);
if (cdi->ops->check_events)
cdi->ops->check_events(cdi, 0, slot);
else
cdi->ops->media_changed(cdi, slot);
if (slot == CDSL_NONE) {
/* set media changed bits, on both queues */
......@@ -1392,6 +1395,42 @@ static int cdrom_select_disc(struct cdrom_device_info *cdi, int slot)
return slot;
}
/*
* As cdrom implements an extra ioctl consumer for media changed
* event, it needs to buffer ->check_events() output, such that event
* is not lost for both the usual VFS and ioctl paths.
* cdi->{vfs|ioctl}_events are used to buffer pending events for each
* path.
*
* XXX: Locking is non-existent. cdi->ops->check_events() can be
* called in parallel and buffering fields are accessed without any
* exclusion. The original media_changed code had the same problem.
* It might be better to simply deprecate CDROM_MEDIA_CHANGED ioctl
* and remove this cruft altogether. It doesn't have much usefulness
* at this point.
*/
static void cdrom_update_events(struct cdrom_device_info *cdi,
unsigned int clearing)
{
unsigned int events;
events = cdi->ops->check_events(cdi, clearing, CDSL_CURRENT);
cdi->vfs_events |= events;
cdi->ioctl_events |= events;
}
unsigned int cdrom_check_events(struct cdrom_device_info *cdi,
unsigned int clearing)
{
unsigned int events;
cdrom_update_events(cdi, clearing);
events = cdi->vfs_events;
cdi->vfs_events = 0;
return events;
}
EXPORT_SYMBOL(cdrom_check_events);
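A hedged plain-C sketch of the double-buffering the comment above describes: one producer fans events out to two consumers' pending masks, so draining one path never loses events for the other (all names are illustrative):
#include <stdio.h>
static unsigned int vfs_pending, ioctl_pending;
/* Analogue of cdrom_update_events(): new events go to both buffers. */
static void update_events(unsigned int events)
{
	vfs_pending |= events;
	ioctl_pending |= events;
}
/* Analogue of cdrom_check_events(): the VFS path drains only its
 * own copy; the ioctl path's copy stays pending. */
static unsigned int check_events_vfs(void)
{
	unsigned int ev = vfs_pending;
	vfs_pending = 0;
	return ev;
}
int main(void)
{
	update_events(1 /* e.g. a media change */);
	printf("vfs sees %u, ioctl still holds %u\n",
	       check_events_vfs(), ioctl_pending);
	return 0;
}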
/* We want to make media_changed accessible to the user through an
* ioctl. The main problem now is that we must double-buffer the
* low-level implementation, to assure that the VFS and the user both
......@@ -1403,15 +1442,26 @@ int media_changed(struct cdrom_device_info *cdi, int queue)
{
unsigned int mask = (1 << (queue & 1));
int ret = !!(cdi->mc_flags & mask);
bool changed;
if (!CDROM_CAN(CDC_MEDIA_CHANGED))
return ret;
return ret;
/* changed since last call? */
if (cdi->ops->media_changed(cdi, CDSL_CURRENT)) {
if (cdi->ops->check_events) {
BUG_ON(!queue); /* shouldn't be called from VFS path */
cdrom_update_events(cdi, DISK_EVENT_MEDIA_CHANGE);
changed = cdi->ioctl_events & DISK_EVENT_MEDIA_CHANGE;
cdi->ioctl_events = 0;
} else
changed = cdi->ops->media_changed(cdi, CDSL_CURRENT);
if (changed) {
cdi->mc_flags = 0x3; /* set bit on both queues */
ret |= 1;
cdi->media_written = 0;
}
cdi->mc_flags &= ~mask; /* clear bit */
return ret;
}
......
......@@ -65,15 +65,12 @@ static int raw_open(struct inode *inode, struct file *filp)
if (!bdev)
goto out;
igrab(bdev->bd_inode);
err = blkdev_get(bdev, filp->f_mode);
err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open);
if (err)
goto out;
err = bd_claim(bdev, raw_open);
if (err)
goto out1;
err = set_blocksize(bdev, bdev_logical_block_size(bdev));
if (err)
goto out2;
goto out1;
filp->f_flags |= O_DIRECT;
filp->f_mapping = bdev->bd_inode->i_mapping;
if (++raw_devices[minor].inuse == 1)
......@@ -83,10 +80,8 @@ static int raw_open(struct inode *inode, struct file *filp)
mutex_unlock(&raw_mutex);
return 0;
out2:
bd_release(bdev);
out1:
blkdev_put(bdev, filp->f_mode);
blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
out:
mutex_unlock(&raw_mutex);
return err;
......@@ -110,8 +105,7 @@ static int raw_release(struct inode *inode, struct file *filp)
}
mutex_unlock(&raw_mutex);
bd_release(bdev);
blkdev_put(bdev, filp->f_mode);
blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
return 0;
}
......
......@@ -325,15 +325,18 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
BUG_ON(d->dm_dev.bdev);
bdev = open_by_devnum(dev, d->dm_dev.mode);
bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md));
if (r)
blkdev_put(bdev, d->dm_dev.mode);
else
d->dm_dev.bdev = bdev;
return r;
r = bd_link_disk_holder(bdev, dm_disk(md));
if (r) {
blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
return r;
}
d->dm_dev.bdev = bdev;
return 0;
}
/*
......@@ -344,8 +347,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
if (!d->dm_dev.bdev)
return;
bd_release_from_disk(d->dm_dev.bdev, dm_disk(md));
blkdev_put(d->dm_dev.bdev, d->dm_dev.mode);
blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
d->dm_dev.bdev = NULL;
}
......
......@@ -630,7 +630,7 @@ static void dec_pending(struct dm_io *io, int error)
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
trace_block_bio_complete(md->queue, bio);
trace_block_bio_complete(md->queue, bio, io_error);
bio_endio(bio, io_error);
}
}
......@@ -990,8 +990,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev, sector);
trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
tio->io->bio->bi_bdev->bd_dev, sector);
generic_make_request(clone);
} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
......
......@@ -1879,7 +1879,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
list_add_rcu(&rdev->same_set, &mddev->disks);
bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
/* May as well allow recovery to be retried once */
mddev->recovery_disabled = 0;
......@@ -1906,7 +1906,6 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
MD_BUG();
return;
}
bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
......@@ -1934,19 +1933,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
shared ? (mdk_rdev_t *)lock_rdev : rdev);
if (IS_ERR(bdev)) {
printk(KERN_ERR "md: could not open %s.\n",
__bdevname(dev, b));
return PTR_ERR(bdev);
}
err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
if (err) {
printk(KERN_ERR "md: could not bd_claim %s.\n",
bdevname(bdev, b));
blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
return err;
}
if (!shared)
set_bit(AllReserved, &rdev->flags);
rdev->bdev = bdev;
......@@ -1959,8 +1952,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
rdev->bdev = NULL;
if (!bdev)
MD_BUG();
bd_release(bdev);
blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
void md_autodetect_dev(dev_t dev);
......
......@@ -224,7 +224,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
if (dev->blkdev) {
invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping,
0, -1);
close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE);
blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
kfree(dev);
......@@ -234,6 +234,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
/* FIXME: ensure that mtd->size % erase_size == 0 */
static struct block2mtd_dev *add_device(char *devname, int erase_size)
{
const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
struct block_device *bdev;
struct block2mtd_dev *dev;
char *name;
......@@ -246,7 +247,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
return NULL;
/* Get a handle on the device */
bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, NULL);
bdev = blkdev_get_by_path(devname, mode, dev);
#ifndef MODULE
if (IS_ERR(bdev)) {
......@@ -254,9 +255,8 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
to resolve the device name by other means. */
dev_t devt = name_to_dev_t(devname);
if (devt) {
bdev = open_by_devnum(devt, FMODE_WRITE | FMODE_READ);
}
if (devt)
bdev = blkdev_get_by_dev(devt, mode, dev);
}
#endif
......
......@@ -103,7 +103,7 @@ int dasd_scan_partitions(struct dasd_block *block)
struct block_device *bdev;
bdev = bdget_disk(block->gdp, 0);
if (!bdev || blkdev_get(bdev, FMODE_READ) < 0)
if (!bdev || blkdev_get(bdev, FMODE_READ, NULL) < 0)
return -ENODEV;
/*
* See fs/partition/check.c:register_disk,rescan_partitions
......