17497acbdc
blk-mq uses percpu_ref for its usage counter which tracks the number of in-flight commands and used to synchronously drain the queue on freeze. percpu_ref shutdown takes measureable wallclock time as it involves a sched RCU grace period. This means that draining a blk-mq takes measureable wallclock time. One would think that this shouldn't matter as queue shutdown should be a rare event which takes place asynchronously w.r.t. userland. Unfortunately, SCSI probing involves synchronously setting up and then tearing down a lot of request_queues back-to-back for non-existent LUNs. This means that SCSI probing may take above ten seconds when scsi-mq is used. [ 0.949892] scsi host0: Virtio SCSI HBA [ 1.007864] scsi 0:0:0:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5 [ 1.021299] scsi 0:0:1:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5 [ 1.520356] tsc: Refined TSC clocksource calibration: 2491.910 MHz <stall> [ 16.186549] sd 0:0:0:0: Attached scsi generic sg0 type 0 [ 16.190478] sd 0:0:1:0: Attached scsi generic sg1 type 0 [ 16.194099] osd: LOADED open-osd 0.2.1 [ 16.203202] sd 0:0:0:0: [sda] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB) [ 16.208478] sd 0:0:0:0: [sda] Write Protect is off [ 16.211439] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA [ 16.218771] sd 0:0:1:0: [sdb] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB) [ 16.223264] sd 0:0:1:0: [sdb] Write Protect is off [ 16.225682] sd 0:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA This is also the reason why request_queues start in bypass mode which is ended on blk_register_queue() as shutting down a fully functional queue also involves a RCU grace period and the queues for non-existent SCSI devices never reach registration. blk-mq basically needs to do the same thing - start the mq in a degraded mode which is faster to shut down and then make it fully functional only after the queue reaches registration. percpu_ref recently grew facilities to force atomic operation until explicitly switched to percpu mode, which can be used for this purpose. This patch makes blk-mq initialize q->mq_usage_counter in atomic mode and switch it to percpu mode only once blk_register_queue() is reached. Note that this issue was previously worked around by0a30288da1
("blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe") for v3.17. The temp fix was reverted in preparation of adding persistent atomic mode to percpu_ref by9eca80461a
("Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe""). This patch and the prerequisite percpu_ref changes will be merged during v3.18 devel cycle. Signed-off-by: Tejun Heo <tj@kernel.org> Reported-by: Christoph Hellwig <hch@infradead.org> Link: http://lkml.kernel.org/g/20140919113815.GA10791@lst.de Fixes:add703fda9
("blk-mq: use percpu_ref for mq usage count") Reviewed-by: Kent Overstreet <kmo@daterainc.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Johannes Weiner <hannes@cmpxchg.org>
463 lines
11 KiB
C
463 lines
11 KiB
C
#include <linux/kernel.h>
|
|
#include <linux/module.h>
|
|
#include <linux/backing-dev.h>
|
|
#include <linux/bio.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/init.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/blk-mq.h>
|
|
#include "blk-mq.h"
|
|
#include "blk-mq-tag.h"
|
|
|
|
static void blk_mq_sysfs_release(struct kobject *kobj)
|
|
{
|
|
}
|
|
|
|
struct blk_mq_ctx_sysfs_entry {
|
|
struct attribute attr;
|
|
ssize_t (*show)(struct blk_mq_ctx *, char *);
|
|
ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
|
|
};
|
|
|
|
struct blk_mq_hw_ctx_sysfs_entry {
|
|
struct attribute attr;
|
|
ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
|
|
ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
|
|
};
|
|
|
|
static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
|
|
char *page)
|
|
{
|
|
struct blk_mq_ctx_sysfs_entry *entry;
|
|
struct blk_mq_ctx *ctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
|
|
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
|
|
q = ctx->queue;
|
|
|
|
if (!entry->show)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->show(ctx, page);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
|
|
const char *page, size_t length)
|
|
{
|
|
struct blk_mq_ctx_sysfs_entry *entry;
|
|
struct blk_mq_ctx *ctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
|
|
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
|
|
q = ctx->queue;
|
|
|
|
if (!entry->store)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->store(ctx, page, length);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
|
|
struct attribute *attr, char *page)
|
|
{
|
|
struct blk_mq_hw_ctx_sysfs_entry *entry;
|
|
struct blk_mq_hw_ctx *hctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
|
|
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
|
|
q = hctx->queue;
|
|
|
|
if (!entry->show)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->show(hctx, page);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
|
|
struct attribute *attr, const char *page,
|
|
size_t length)
|
|
{
|
|
struct blk_mq_hw_ctx_sysfs_entry *entry;
|
|
struct blk_mq_hw_ctx *hctx;
|
|
struct request_queue *q;
|
|
ssize_t res;
|
|
|
|
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
|
|
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
|
|
q = hctx->queue;
|
|
|
|
if (!entry->store)
|
|
return -EIO;
|
|
|
|
res = -ENOENT;
|
|
mutex_lock(&q->sysfs_lock);
|
|
if (!blk_queue_dying(q))
|
|
res = entry->store(hctx, page, length);
|
|
mutex_unlock(&q->sysfs_lock);
|
|
return res;
|
|
}
|
|
|
|
static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
|
|
{
|
|
return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
|
|
ctx->rq_dispatched[0]);
|
|
}
|
|
|
|
static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
|
|
{
|
|
return sprintf(page, "%lu\n", ctx->rq_merged);
|
|
}
|
|
|
|
static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
|
|
{
|
|
return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
|
|
ctx->rq_completed[0]);
|
|
}
|
|
|
|
static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
|
|
{
|
|
char *start_page = page;
|
|
struct request *rq;
|
|
|
|
page += sprintf(page, "%s:\n", msg);
|
|
|
|
list_for_each_entry(rq, list, queuelist)
|
|
page += sprintf(page, "\t%p\n", rq);
|
|
|
|
return page - start_page;
|
|
}
|
|
|
|
static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
|
|
{
|
|
ssize_t ret;
|
|
|
|
spin_lock(&ctx->lock);
|
|
ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
|
|
spin_unlock(&ctx->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
|
|
char *page)
|
|
{
|
|
return sprintf(page, "%lu\n", hctx->queued);
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
|
|
{
|
|
return sprintf(page, "%lu\n", hctx->run);
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
|
|
char *page)
|
|
{
|
|
char *start_page = page;
|
|
int i;
|
|
|
|
page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
|
|
|
|
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
|
|
unsigned long d = 1U << (i - 1);
|
|
|
|
page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
|
|
}
|
|
|
|
return page - start_page;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
|
|
char *page)
|
|
{
|
|
ssize_t ret;
|
|
|
|
spin_lock(&hctx->lock);
|
|
ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
|
|
spin_unlock(&hctx->lock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
|
|
{
|
|
return blk_mq_tag_sysfs_show(hctx->tags, page);
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
|
|
{
|
|
return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
|
|
}
|
|
|
|
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
|
|
{
|
|
unsigned int i, first = 1;
|
|
ssize_t ret = 0;
|
|
|
|
blk_mq_disable_hotplug();
|
|
|
|
for_each_cpu(i, hctx->cpumask) {
|
|
if (first)
|
|
ret += sprintf(ret + page, "%u", i);
|
|
else
|
|
ret += sprintf(ret + page, ", %u", i);
|
|
|
|
first = 0;
|
|
}
|
|
|
|
blk_mq_enable_hotplug();
|
|
|
|
ret += sprintf(ret + page, "\n");
|
|
return ret;
|
|
}
|
|
|
|
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
|
|
.attr = {.name = "dispatched", .mode = S_IRUGO },
|
|
.show = blk_mq_sysfs_dispatched_show,
|
|
};
|
|
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
|
|
.attr = {.name = "merged", .mode = S_IRUGO },
|
|
.show = blk_mq_sysfs_merged_show,
|
|
};
|
|
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
|
|
.attr = {.name = "completed", .mode = S_IRUGO },
|
|
.show = blk_mq_sysfs_completed_show,
|
|
};
|
|
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
|
|
.attr = {.name = "rq_list", .mode = S_IRUGO },
|
|
.show = blk_mq_sysfs_rq_list_show,
|
|
};
|
|
|
|
static struct attribute *default_ctx_attrs[] = {
|
|
&blk_mq_sysfs_dispatched.attr,
|
|
&blk_mq_sysfs_merged.attr,
|
|
&blk_mq_sysfs_completed.attr,
|
|
&blk_mq_sysfs_rq_list.attr,
|
|
NULL,
|
|
};
|
|
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
|
|
.attr = {.name = "queued", .mode = S_IRUGO },
|
|
.show = blk_mq_hw_sysfs_queued_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
|
|
.attr = {.name = "run", .mode = S_IRUGO },
|
|
.show = blk_mq_hw_sysfs_run_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
|
|
.attr = {.name = "dispatched", .mode = S_IRUGO },
|
|
.show = blk_mq_hw_sysfs_dispatched_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
|
|
.attr = {.name = "active", .mode = S_IRUGO },
|
|
.show = blk_mq_hw_sysfs_active_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
|
|
.attr = {.name = "pending", .mode = S_IRUGO },
|
|
.show = blk_mq_hw_sysfs_rq_list_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
|
|
.attr = {.name = "tags", .mode = S_IRUGO },
|
|
.show = blk_mq_hw_sysfs_tags_show,
|
|
};
|
|
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
|
|
.attr = {.name = "cpu_list", .mode = S_IRUGO },
|
|
.show = blk_mq_hw_sysfs_cpus_show,
|
|
};
|
|
|
|
static struct attribute *default_hw_ctx_attrs[] = {
|
|
&blk_mq_hw_sysfs_queued.attr,
|
|
&blk_mq_hw_sysfs_run.attr,
|
|
&blk_mq_hw_sysfs_dispatched.attr,
|
|
&blk_mq_hw_sysfs_pending.attr,
|
|
&blk_mq_hw_sysfs_tags.attr,
|
|
&blk_mq_hw_sysfs_cpus.attr,
|
|
&blk_mq_hw_sysfs_active.attr,
|
|
NULL,
|
|
};
|
|
|
|
static const struct sysfs_ops blk_mq_sysfs_ops = {
|
|
.show = blk_mq_sysfs_show,
|
|
.store = blk_mq_sysfs_store,
|
|
};
|
|
|
|
static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
|
|
.show = blk_mq_hw_sysfs_show,
|
|
.store = blk_mq_hw_sysfs_store,
|
|
};
|
|
|
|
static struct kobj_type blk_mq_ktype = {
|
|
.sysfs_ops = &blk_mq_sysfs_ops,
|
|
.release = blk_mq_sysfs_release,
|
|
};
|
|
|
|
static struct kobj_type blk_mq_ctx_ktype = {
|
|
.sysfs_ops = &blk_mq_sysfs_ops,
|
|
.default_attrs = default_ctx_attrs,
|
|
.release = blk_mq_sysfs_release,
|
|
};
|
|
|
|
static struct kobj_type blk_mq_hw_ktype = {
|
|
.sysfs_ops = &blk_mq_hw_sysfs_ops,
|
|
.default_attrs = default_hw_ctx_attrs,
|
|
.release = blk_mq_sysfs_release,
|
|
};
|
|
|
|
static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
|
|
{
|
|
struct blk_mq_ctx *ctx;
|
|
int i;
|
|
|
|
if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
|
|
return;
|
|
|
|
hctx_for_each_ctx(hctx, ctx, i)
|
|
kobject_del(&ctx->kobj);
|
|
|
|
kobject_del(&hctx->kobj);
|
|
}
|
|
|
|
static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
|
|
{
|
|
struct request_queue *q = hctx->queue;
|
|
struct blk_mq_ctx *ctx;
|
|
int i, ret;
|
|
|
|
if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
|
|
return 0;
|
|
|
|
ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
|
|
if (ret)
|
|
return ret;
|
|
|
|
hctx_for_each_ctx(hctx, ctx, i) {
|
|
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
|
|
if (ret)
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
void blk_mq_unregister_disk(struct gendisk *disk)
|
|
{
|
|
struct request_queue *q = disk->queue;
|
|
struct blk_mq_hw_ctx *hctx;
|
|
struct blk_mq_ctx *ctx;
|
|
int i, j;
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i) {
|
|
blk_mq_unregister_hctx(hctx);
|
|
|
|
hctx_for_each_ctx(hctx, ctx, j)
|
|
kobject_put(&ctx->kobj);
|
|
|
|
kobject_put(&hctx->kobj);
|
|
}
|
|
|
|
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
|
|
kobject_del(&q->mq_kobj);
|
|
kobject_put(&q->mq_kobj);
|
|
|
|
kobject_put(&disk_to_dev(disk)->kobj);
|
|
}
|
|
|
|
static void blk_mq_sysfs_init(struct request_queue *q)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx;
|
|
struct blk_mq_ctx *ctx;
|
|
int i, j;
|
|
|
|
kobject_init(&q->mq_kobj, &blk_mq_ktype);
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i) {
|
|
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
|
|
|
|
hctx_for_each_ctx(hctx, ctx, j)
|
|
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
|
|
}
|
|
}
|
|
|
|
/* see blk_register_queue() */
|
|
void blk_mq_finish_init(struct request_queue *q)
|
|
{
|
|
percpu_ref_switch_to_percpu(&q->mq_usage_counter);
|
|
}
|
|
|
|
int blk_mq_register_disk(struct gendisk *disk)
|
|
{
|
|
struct device *dev = disk_to_dev(disk);
|
|
struct request_queue *q = disk->queue;
|
|
struct blk_mq_hw_ctx *hctx;
|
|
int ret, i;
|
|
|
|
blk_mq_sysfs_init(q);
|
|
|
|
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
|
|
if (ret < 0)
|
|
return ret;
|
|
|
|
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i) {
|
|
hctx->flags |= BLK_MQ_F_SYSFS_UP;
|
|
ret = blk_mq_register_hctx(hctx);
|
|
if (ret)
|
|
break;
|
|
}
|
|
|
|
if (ret) {
|
|
blk_mq_unregister_disk(disk);
|
|
return ret;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
void blk_mq_sysfs_unregister(struct request_queue *q)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx;
|
|
int i;
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i)
|
|
blk_mq_unregister_hctx(hctx);
|
|
}
|
|
|
|
int blk_mq_sysfs_register(struct request_queue *q)
|
|
{
|
|
struct blk_mq_hw_ctx *hctx;
|
|
int i, ret = 0;
|
|
|
|
queue_for_each_hw_ctx(q, hctx, i) {
|
|
ret = blk_mq_register_hctx(hctx);
|
|
if (ret)
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|