for-5.9/drivers-20200803

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl8od3oQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgppkpD/9D+XqD9qYcYTj+ShVCc5+3RtMG5ZiAAX0y
 l4QXomentn/1Y0UYXFGJH7JLZWrKYT0QiktLtfpe5pmTqRUkckTIyJQlsHb+K6Dz
 lFjtywRK9pcFYgiWIUg80wlJKrTa8QdnrlS/Esn4YITKGRbgMIdFvq2jymXC+1ho
 RgodlgzcBUREgHSLo0H3cqEKA53fQiJhKC6CbFrFdrkpf2yUpcTfEDtpSwuIuPj3
 2AUed1qXUtNjdHciCn3N37OuHqXKAA9noXAWfg9Gx/5zfGUNX9QJvlsny1AopgS0
 jJvPSDVAhu/qRLHW6q/ZOT0JAlHegguuTAOtgMh2cMpAS5sumCAtltxVcI7Qnx41
 HalMpTefXsVoBo0gfjqldnIPt34ZNj5aH5GYaH/wPpSg6VkTVBJK8GuQDBvg27qT
 w+U/T6EzuqniWXh/P3COhfrMCR9ueUOY1qWCRwzomlpeIfBhCzidt2wUqIxX1TOA
 Q0Ltf0eERDevsZbE+tIm+VAAg98kHehcS2t8lfFYFO6/PKu2iJpJt/HtJbZNBE+W
 rm96E4qXRiy1UuL7D9vBkaWsbnosuNHgGQXx57GlokQU+2IGBmOxV52XHiSxxpXd
 AS1ZTd56ItmID8VaU09Pbf7ZFbiCgdEAxIbUFzaCuvo+lxryHFphIUARNi/zPnNT
 UC2OzunCqA==
 =oADH
 -----END PGP SIGNATURE-----

Merge tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:

 - NVMe:
      - ZNS support (Aravind, Keith, Matias, Niklas)
      - Misc cleanups, optimizations, fixes (Baolin, Chaitanya, David,
        Dongli, Max, Sagi)

 - null_blk zone capacity support (Aravind)

 - MD:
      - raid5/6 fixes (ChangSyun)
      - Warning fixes (Damien)
      - raid5 stripe fixes (Guoqing, Song, Yufen)
      - sysfs deadlock fix (Junxiao)
      - raid10 deadlock fix (Vitaly)

 - struct_size conversions (Gustavo)

 - Set of bcache updates/fixes (Coly)

* tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block: (117 commits)
  md/raid5: Allow degraded raid6 to do rmw
  md/raid5: Fix Force reconstruct-write io stuck in degraded raid5
  raid5: don't duplicate code for different paths in handle_stripe
  raid5-cache: hold spinlock instead of mutex in r5c_journal_mode_show
  md: print errno in super_written
  md/raid5: remove the redundant setting of STRIPE_HANDLE
  md: register new md sysfs file 'uuid' read-only
  md: fix max sectors calculation for super 1.0
  nvme-loop: remove extra variable in create ctrl
  nvme-loop: set ctrl state connecting after init
  nvme-multipath: do not fall back to __nvme_find_path() for non-optimized paths
  nvme-multipath: fix logic for non-optimized paths
  nvme-rdma: fix controller reset hang during traffic
  nvme-tcp: fix controller reset hang during traffic
  nvmet: introduce the passthru Kconfig option
  nvmet: introduce the passthru configfs interface
  nvmet: Add passthru enable/disable helpers
  nvmet: add passthru code to process commands
  nvme: export nvme_find_get_ns() and nvme_put_ns()
  nvme: introduce nvme_ctrl_get_by_path()
  ...
This commit is contained in:
Linus Torvalds 2020-08-05 10:51:40 -07:00
commit e0fc99e21e
70 changed files with 3149 additions and 826 deletions

View File

@ -273,6 +273,24 @@ Description:
device ("host-aware" or "host-managed" zone model). For regular
block devices, the value is always 0.
What: /sys/block/<disk>/queue/max_active_zones
Date: July 2020
Contact: Niklas Cassel <niklas.cassel@wdc.com>
Description:
For zoned block devices (zoned attribute indicating
"host-managed" or "host-aware"), the sum of zones belonging to
any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED,
is limited by this value. If this value is 0, there is no limit.
What: /sys/block/<disk>/queue/max_open_zones
Date: July 2020
Contact: Niklas Cassel <niklas.cassel@wdc.com>
Description:
For zoned block devices (zoned attribute indicating
"host-managed" or "host-aware"), the sum of zones belonging to
any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN,
is limited by this value. If this value is 0, there is no limit.
What: /sys/block/<disk>/queue/chunk_sectors
Date: September 2016
Contact: Hannes Reinecke <hare@suse.com>

View File

@ -426,6 +426,10 @@ All md devices contain:
The accepted values when writing to this file are ``ppl`` and ``resync``,
used to enable and disable PPL.
uuid
This indicates the UUID of the array in the following format:
xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
As component devices are added to an md array, they appear in the ``md``
directory as new directories named::

View File

@ -117,6 +117,20 @@ Maximum number of elements in a DMA scatter/gather list with integrity
data that will be submitted by the block layer core to the associated
block driver.
max_active_zones (RO)
---------------------
For zoned block devices (zoned attribute indicating "host-managed" or
"host-aware"), the sum of zones belonging to any of the zone states:
EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value.
If this value is 0, there is no limit.
max_open_zones (RO)
-------------------
For zoned block devices (zoned attribute indicating "host-managed" or
"host-aware"), the sum of zones belonging to any of the zone states:
EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value.
If this value is 0, there is no limit.
max_sectors_kb (RW)
-------------------
This is the maximum number of kilobytes that the block layer will allow

View File

@ -86,9 +86,10 @@ config BLK_DEV_ZONED
select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
support for ZAC/ZBC host-managed and host-aware zoned block devices.
support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
devices.
Say yes here if you have a ZAC or ZBC storage device.
Say yes here if you have a ZAC, ZBC, or ZNS storage device.
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"

View File

@ -306,6 +306,16 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_nr_zones(q), page);
}
static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_max_open_zones(q), page);
}
static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_max_active_zones(q), page);
}
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
{
return queue_var_show((blk_queue_nomerges(q) << 1) |
@ -668,6 +678,16 @@ static struct queue_sysfs_entry queue_nr_zones_entry = {
.show = queue_nr_zones_show,
};
static struct queue_sysfs_entry queue_max_open_zones_entry = {
.attr = {.name = "max_open_zones", .mode = 0444 },
.show = queue_max_open_zones_show,
};
static struct queue_sysfs_entry queue_max_active_zones_entry = {
.attr = {.name = "max_active_zones", .mode = 0444 },
.show = queue_max_active_zones_show,
};
static struct queue_sysfs_entry queue_nomerges_entry = {
.attr = {.name = "nomerges", .mode = 0644 },
.show = queue_nomerges_show,
@ -766,6 +786,8 @@ static struct attribute *queue_attrs[] = {
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
&queue_max_open_zones_entry.attr,
&queue_max_active_zones_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
@ -793,6 +815,11 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
(!q->mq_ops || !q->mq_ops->timeout))
return 0;
if ((attr == &queue_max_open_zones_entry.attr ||
attr == &queue_max_active_zones_entry.attr) &&
!blk_queue_is_zoned(q))
return 0;
return attr->mode;
}

View File

@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
return ret;
rep.nr_zones = ret;
rep.flags = BLK_ZONE_REP_CAPACITY;
if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
return -EFAULT;
return 0;

View File

@ -45,6 +45,9 @@ static const guid_t prp_guids[] = {
/* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */
GUID_INIT(0x6c501103, 0xc189, 0x4296,
0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d),
/* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */
GUID_INIT(0x5025030f, 0x842f, 0x4ab4,
0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0),
};
/* ACPI _DSD data subnodes GUID: dbb8e3e6-5886-4ba6-8795-1319f52a966b */

View File

@ -49,6 +49,7 @@ struct nullb_device {
unsigned long completion_nsec; /* time in ns to complete a request */
unsigned long cache_size; /* disk cache size in MB */
unsigned long zone_size; /* zone size in MB if device is zoned */
unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
unsigned int zone_nr_conv; /* number of conventional zones */
unsigned int submit_queues; /* number of submission queues */
unsigned int home_node; /* home node for the device */

View File

@ -200,6 +200,10 @@ static unsigned long g_zone_size = 256;
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
static unsigned long g_zone_capacity;
module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");
static unsigned int g_zone_nr_conv;
module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
@ -341,6 +345,7 @@ NULLB_DEVICE_ATTR(mbps, uint, NULL);
NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
NULLB_DEVICE_ATTR(zoned, bool, NULL);
NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
@ -457,6 +462,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_badblocks,
&nullb_device_attr_zoned,
&nullb_device_attr_zone_size,
&nullb_device_attr_zone_capacity,
&nullb_device_attr_zone_nr_conv,
NULL,
};
@ -510,7 +516,8 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_nr_conv\n");
return snprintf(page, PAGE_SIZE,
"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@ -571,6 +578,7 @@ static struct nullb_device *null_alloc_dev(void)
dev->use_per_node_hctx = g_use_per_node_hctx;
dev->zoned = g_zoned;
dev->zone_size = g_zone_size;
dev->zone_capacity = g_zone_capacity;
dev->zone_nr_conv = g_zone_nr_conv;
return dev;
}

View File

@ -28,6 +28,15 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
return -EINVAL;
}
if (!dev->zone_capacity)
dev->zone_capacity = dev->zone_size;
if (dev->zone_capacity > dev->zone_size) {
pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
dev->zone_capacity, dev->zone_size);
return -EINVAL;
}
dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
dev->nr_zones = dev_size >>
(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
@ -47,6 +56,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
zone->start = sector;
zone->len = dev->zone_size_sects;
zone->capacity = zone->len;
zone->wp = zone->start + zone->len;
zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
zone->cond = BLK_ZONE_COND_NOT_WP;
@ -59,6 +69,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
zone->start = zone->wp = sector;
zone->len = dev->zone_size_sects;
zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT;
zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone->cond = BLK_ZONE_COND_EMPTY;
@ -185,6 +196,9 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
return BLK_STS_IOERR;
}
if (zone->wp + nr_sectors > zone->start + zone->capacity)
return BLK_STS_IOERR;
if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
@ -193,7 +207,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
return ret;
zone->wp += nr_sectors;
if (zone->wp == zone->start + zone->len)
if (zone->wp == zone->start + zone->capacity)
zone->cond = BLK_ZONE_COND_FULL;
return BLK_STS_OK;
default:

View File

@ -562,13 +562,15 @@ static int rsxx_eeh_frozen(struct pci_dev *dev)
for (i = 0; i < card->n_targets; i++) {
if (card->ctrl[i].status.buf)
pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
card->ctrl[i].status.buf,
card->ctrl[i].status.dma_addr);
dma_free_coherent(&card->dev->dev,
STATUS_BUFFER_SIZE8,
card->ctrl[i].status.buf,
card->ctrl[i].status.dma_addr);
if (card->ctrl[i].cmd.buf)
pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
card->ctrl[i].cmd.buf,
card->ctrl[i].cmd.dma_addr);
dma_free_coherent(&card->dev->dev,
COMMAND_BUFFER_SIZE8,
card->ctrl[i].cmd.buf,
card->ctrl[i].cmd.dma_addr);
}
return 0;
@ -711,15 +713,15 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
failed_hw_buffers_init:
for (i = 0; i < card->n_targets; i++) {
if (card->ctrl[i].status.buf)
pci_free_consistent(card->dev,
STATUS_BUFFER_SIZE8,
card->ctrl[i].status.buf,
card->ctrl[i].status.dma_addr);
dma_free_coherent(&card->dev->dev,
STATUS_BUFFER_SIZE8,
card->ctrl[i].status.buf,
card->ctrl[i].status.dma_addr);
if (card->ctrl[i].cmd.buf)
pci_free_consistent(card->dev,
COMMAND_BUFFER_SIZE8,
card->ctrl[i].cmd.buf,
card->ctrl[i].cmd.dma_addr);
dma_free_coherent(&card->dev->dev,
COMMAND_BUFFER_SIZE8,
card->ctrl[i].cmd.buf,
card->ctrl[i].cmd.dma_addr);
}
failed_hw_setup:
rsxx_eeh_failure(dev);

View File

@ -27,7 +27,7 @@ config BCACHE_CLOSURES_DEBUG
interface to list them, which makes it possible to see asynchronous
operations that get stuck.
config BCACHE_ASYNC_REGISTRAION
config BCACHE_ASYNC_REGISTRATION
bool "Asynchronous device registration (EXPERIMENTAL)"
depends on BCACHE
help

View File

@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE) += bcache.o
bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
util.o writeback.o
util.o writeback.o features.o

View File

@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
{
struct cache *ca;
struct bucket *b;
unsigned int next = c->nbuckets * c->sb.bucket_size / 1024;
unsigned long next = c->nbuckets * c->sb.bucket_size / 1024;
unsigned int i;
int r;

View File

@ -264,7 +264,7 @@ struct bcache_device {
#define BCACHE_DEV_UNLINK_DONE 2
#define BCACHE_DEV_WB_RUNNING 3
#define BCACHE_DEV_RATE_DW_RUNNING 4
unsigned int nr_stripes;
int nr_stripes;
unsigned int stripe_size;
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
@ -762,11 +762,32 @@ struct bbio {
#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
#define block_bytes(c) ((c)->sb.block_size << 9)
#define prios_per_bucket(c) \
((bucket_bytes(c) - sizeof(struct prio_set)) / \
static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
{
unsigned int n, max_pages;
max_pages = min_t(unsigned int,
__rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS,
MAX_ORDER_NR_PAGES);
n = sb->bucket_size / PAGE_SECTORS;
if (n > max_pages)
n = max_pages;
return n;
}
static inline unsigned int meta_bucket_bytes(struct cache_sb *sb)
{
return meta_bucket_pages(sb) << PAGE_SHIFT;
}
#define prios_per_bucket(ca) \
((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \
sizeof(struct bucket_disk))
#define prio_buckets(c) \
DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
#define prio_buckets(ca) \
DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca))
static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
{

View File

@ -322,7 +322,7 @@ int bch_btree_keys_alloc(struct btree_keys *b,
b->page_order = page_order;
t->data = (void *) __get_free_pages(gfp, b->page_order);
t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order);
if (!t->data)
goto err;

View File

@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c)
if (c->verify_data)
list_move(&c->verify_data->list, &c->btree_cache);
free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb)));
#endif
list_splice(&c->btree_cache_freeable,
@ -785,7 +785,15 @@ int bch_btree_cache_alloc(struct cache_set *c)
mutex_init(&c->verify_lock);
c->verify_ondisk = (void *)
__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
__get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb)));
if (!c->verify_ondisk) {
/*
* Don't worry about the mca_rereserve buckets
* allocated in previous for-loop, they will be
* handled properly in bch_cache_set_unregister().
*/
return -ENOMEM;
}
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

View File

@ -0,0 +1,75 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Feature set bits and string conversion.
* Inspired by ext4's features compat/incompat/ro_compat related code.
*
* Copyright 2020 Coly Li <colyli@suse.de>
*
*/
#include <linux/bcache.h>
#include "bcache.h"
#include "features.h"
struct feature {
int compat;
unsigned int mask;
const char *string;
};
static struct feature feature_list[] = {
{BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET,
"large_bucket"},
{0, 0, 0 },
};
#define compose_feature_string(type) \
({ \
struct feature *f; \
bool first = true; \
\
for (f = &feature_list[0]; f->compat != 0; f++) { \
if (f->compat != BCH_FEATURE_ ## type) \
continue; \
if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) { \
if (first) { \
out += snprintf(out, buf + size - out, \
"["); \
} else { \
out += snprintf(out, buf + size - out, \
" ["); \
} \
} else if (!first) { \
out += snprintf(out, buf + size - out, " "); \
} \
\
out += snprintf(out, buf + size - out, "%s", f->string);\
\
if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) \
out += snprintf(out, buf + size - out, "]"); \
\
first = false; \
} \
if (!first) \
out += snprintf(out, buf + size - out, "\n"); \
})
int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size)
{
char *out = buf;
compose_feature_string(COMPAT);
return out - buf;
}
int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size)
{
char *out = buf;
compose_feature_string(RO_COMPAT);
return out - buf;
}
int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size)
{
char *out = buf;
compose_feature_string(INCOMPAT);
return out - buf;
}

View File

@ -0,0 +1,86 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _BCACHE_FEATURES_H
#define _BCACHE_FEATURES_H
#include <linux/bcache.h>
#include <linux/kernel.h>
#include <linux/types.h>
#define BCH_FEATURE_COMPAT 0
#define BCH_FEATURE_RO_COMPAT 1
#define BCH_FEATURE_INCOMPAT 2
#define BCH_FEATURE_TYPE_MASK 0x03
/* Feature set definition */
/* Incompat feature set */
#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */
#define BCH_FEATURE_COMPAT_SUUP 0
#define BCH_FEATURE_RO_COMPAT_SUUP 0
#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET
#define BCH_HAS_COMPAT_FEATURE(sb, mask) \
((sb)->feature_compat & (mask))
#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \
((sb)->feature_ro_compat & (mask))
#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \
((sb)->feature_incompat & (mask))
#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline int bch_has_feature_##name(struct cache_sb *sb) \
{ \
return (((sb)->feature_compat & \
BCH##_FEATURE_COMPAT_##flagname) != 0); \
} \
static inline void bch_set_feature_##name(struct cache_sb *sb) \
{ \
(sb)->feature_compat |= \
BCH##_FEATURE_COMPAT_##flagname; \
} \
static inline void bch_clear_feature_##name(struct cache_sb *sb) \
{ \
(sb)->feature_compat &= \
~BCH##_FEATURE_COMPAT_##flagname; \
}
#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline int bch_has_feature_##name(struct cache_sb *sb) \
{ \
return (((sb)->feature_ro_compat & \
BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \
} \
static inline void bch_set_feature_##name(struct cache_sb *sb) \
{ \
(sb)->feature_ro_compat |= \
BCH##_FEATURE_RO_COMPAT_##flagname; \
} \
static inline void bch_clear_feature_##name(struct cache_sb *sb) \
{ \
(sb)->feature_ro_compat &= \
~BCH##_FEATURE_RO_COMPAT_##flagname; \
}
#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline int bch_has_feature_##name(struct cache_sb *sb) \
{ \
return (((sb)->feature_incompat & \
BCH##_FEATURE_INCOMPAT_##flagname) != 0); \
} \
static inline void bch_set_feature_##name(struct cache_sb *sb) \
{ \
(sb)->feature_incompat |= \
BCH##_FEATURE_INCOMPAT_##flagname; \
} \
static inline void bch_clear_feature_##name(struct cache_sb *sb) \
{ \
(sb)->feature_incompat &= \
~BCH##_FEATURE_INCOMPAT_##flagname; \
}
BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET);
int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size);
int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size);
int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size);
#endif

View File

@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
struct bio *bio = &b->bio;
bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb));
return bio;
}

View File

@ -217,10 +217,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
*/
pr_debug("falling back to linear search\n");
for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
l < ca->sb.njournal_buckets;
l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets,
l + 1))
for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets)
if (read_bucket(l))
goto bsearch;
@ -999,8 +996,8 @@ int bch_journal_alloc(struct cache_set *c)
j->w[1].c = c;
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) ||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
return -ENOMEM;
return 0;

View File

@ -145,8 +145,8 @@ static void read_moving(struct cache_set *c)
continue;
}
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
* DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c)
mutex_lock(&c->bucket_lock);
for_each_cache(ca, c, i) {
unsigned int sectors_to_move = 0;
unsigned int reserve_sectors = ca->sb.bucket_size *
unsigned long sectors_to_move = 0;
unsigned long reserve_sectors = ca->sb.bucket_size *
fifo_used(&ca->free[RESERVE_MOVINGGC]);
ca->heap.used = 0;

View File

@ -668,7 +668,9 @@ static void backing_request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
bio_end_io_acct(s->orig_bio, s->start_time);
/* Count on bcache device */
disk_end_io_acct(s->d->disk, bio_op(s->orig_bio), s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
s->orig_bio->bi_status = s->iop.status;
bio_endio(s->orig_bio);
@ -728,8 +730,8 @@ static inline struct search *search_alloc(struct bio *bio,
s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
s->read_dirty_data = 0;
s->start_time = bio_start_io_acct(bio);
/* Count on the bcache device */
s->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
s->iop.c = d->c;
s->iop.bio = NULL;
s->iop.inode = d->id;
@ -1080,7 +1082,8 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io = ddip->bi_end_io;
bio->bi_private = ddip->bi_private;
bio_end_io_acct(bio, ddip->start_time);
/* Count on the bcache device */
disk_end_io_acct(ddip->d->disk, bio_op(bio), ddip->start_time);
if (bio->bi_status) {
struct cached_dev *dc = container_of(ddip->d,
@ -1105,7 +1108,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
*/
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
ddip->d = d;
ddip->start_time = bio_start_io_acct(bio);
/* Count on the bcache device */
ddip->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
ddip->bi_end_io = bio->bi_end_io;
ddip->bi_private = bio->bi_private;
bio->bi_end_io = detached_dev_end_io;

View File

@ -13,6 +13,7 @@
#include "extents.h"
#include "request.h"
#include "writeback.h"
#include "features.h"
#include <linux/blkdev.h>
#include <linux/debugfs.h>
@ -59,6 +60,92 @@ struct workqueue_struct *bch_journal_wq;
/* Superblock */
static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
{
unsigned int bucket_size = le16_to_cpu(s->bucket_size);
if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES &&
bch_has_feature_large_bucket(sb))
bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16;
return bucket_size;
}
static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
struct cache_sb_disk *s)
{
const char *err;
unsigned int i;
sb->first_bucket= le16_to_cpu(s->first_bucket);
sb->nbuckets = le64_to_cpu(s->nbuckets);
sb->bucket_size = get_bucket_size(sb, s);
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
err = "Too many journal buckets";
if (sb->keys > SB_JOURNAL_BUCKETS)
goto err;
err = "Too many buckets";
if (sb->nbuckets > LONG_MAX)
goto err;
err = "Not enough buckets";
if (sb->nbuckets < 1 << 7)
goto err;
err = "Bad block size (not power of 2)";
if (!is_power_of_2(sb->block_size))
goto err;
err = "Bad block size (larger than page size)";
if (sb->block_size > PAGE_SECTORS)
goto err;
err = "Bad bucket size (not power of 2)";
if (!is_power_of_2(sb->bucket_size))
goto err;
err = "Bad bucket size (smaller than page size)";
if (sb->bucket_size < PAGE_SECTORS)
goto err;
err = "Invalid superblock: device too small";
if (get_capacity(bdev->bd_disk) <
sb->bucket_size * sb->nbuckets)
goto err;
err = "Bad UUID";
if (bch_is_zero(sb->set_uuid, 16))
goto err;
err = "Bad cache device number in set";
if (!sb->nr_in_set ||
sb->nr_in_set <= sb->nr_this_dev ||
sb->nr_in_set > MAX_CACHES_PER_SET)
goto err;
err = "Journal buckets not sequential";
for (i = 0; i < sb->keys; i++)
if (sb->d[i] != sb->first_bucket + i)
goto err;
err = "Too many journal buckets";
if (sb->first_bucket + sb->keys > sb->nbuckets)
goto err;
err = "Invalid superblock: first bucket comes before end of super";
if (sb->first_bucket * sb->bucket_size < 16)
goto err;
err = NULL;
err:
return err;
}
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
struct cache_sb_disk **res)
{
@ -84,7 +171,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->flags = le64_to_cpu(s->flags);
sb->seq = le64_to_cpu(s->seq);
sb->last_mount = le32_to_cpu(s->last_mount);
sb->first_bucket = le16_to_cpu(s->first_bucket);
sb->keys = le16_to_cpu(s->keys);
for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
@ -101,10 +187,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
if (memcmp(sb->magic, bcache_magic, 16))
goto err;
err = "Too many journal buckets";
if (sb->keys > SB_JOURNAL_BUCKETS)
goto err;
err = "Bad checksum";
if (s->csum != csum_set(s))
goto err;
@ -124,6 +206,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
sb->data_offset = BDEV_DATA_START_DEFAULT;
break;
case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
sb->data_offset = le64_to_cpu(s->data_offset);
err = "Bad data offset";
@ -133,55 +216,21 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
break;
case BCACHE_SB_VERSION_CDEV:
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
sb->nbuckets = le64_to_cpu(s->nbuckets);
sb->bucket_size = le16_to_cpu(s->bucket_size);
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
err = "Too many buckets";
if (sb->nbuckets > LONG_MAX)
err = read_super_common(sb, bdev, s);
if (err)
goto err;
err = "Not enough buckets";
if (sb->nbuckets < 1 << 7)
break;
case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
/*
* Feature bits are needed in read_super_common(),
* convert them firstly.
*/
sb->feature_compat = le64_to_cpu(s->feature_compat);
sb->feature_incompat = le64_to_cpu(s->feature_incompat);
sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
err = read_super_common(sb, bdev, s);
if (err)
goto err;
err = "Bad block/bucket size";
if (!is_power_of_2(sb->block_size) ||
sb->block_size > PAGE_SECTORS ||
!is_power_of_2(sb->bucket_size) ||
sb->bucket_size < PAGE_SECTORS)
goto err;
err = "Invalid superblock: device too small";
if (get_capacity(bdev->bd_disk) <
sb->bucket_size * sb->nbuckets)
goto err;
err = "Bad UUID";
if (bch_is_zero(sb->set_uuid, 16))
goto err;
err = "Bad cache device number in set";
if (!sb->nr_in_set ||
sb->nr_in_set <= sb->nr_this_dev ||
sb->nr_in_set > MAX_CACHES_PER_SET)
goto err;
err = "Journal buckets not sequential";
for (i = 0; i < sb->keys; i++)
if (sb->d[i] != sb->first_bucket + i)
goto err;
err = "Too many journal buckets";
if (sb->first_bucket + sb->keys > sb->nbuckets)
goto err;
err = "Invalid superblock: first bucket comes before end of super";
if (sb->first_bucket * sb->bucket_size < 16)
goto err;
break;
default:
err = "Unsupported superblock version";
@ -217,7 +266,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
offset_in_page(out));
out->offset = cpu_to_le64(sb->offset);
out->version = cpu_to_le64(sb->version);
memcpy(out->uuid, sb->uuid, 16);
memcpy(out->set_uuid, sb->set_uuid, 16);
@ -233,6 +281,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
for (i = 0; i < sb->keys; i++)
out->d[i] = cpu_to_le64(sb->d[i]);
if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
out->feature_compat = cpu_to_le64(sb->feature_compat);
out->feature_incompat = cpu_to_le64(sb->feature_incompat);
out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
}
out->version = cpu_to_le64(sb->version);
out->csum = csum_set(out);
pr_debug("ver %llu, flags %llu, seq %llu\n",
@ -289,17 +344,20 @@ void bcache_write_super(struct cache_set *c)
{
struct closure *cl = &c->sb_write;
struct cache *ca;
unsigned int i;
unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
down(&c->sb_write_mutex);
closure_init(cl, &c->cl);
c->sb.seq++;
if (c->sb.version > version)
version = c->sb.version;
for_each_cache(ca, c, i) {
struct bio *bio = &ca->sb_bio;
ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
ca->sb.version = version;
ca->sb.seq = c->sb.seq;
ca->sb.last_mount = c->sb.last_mount;
@ -423,6 +481,7 @@ static int __uuid_write(struct cache_set *c)
BKEY_PADDED(key) k;
struct closure cl;
struct cache *ca;
unsigned int size;
closure_init_stack(&cl);
lockdep_assert_held(&bch_register_lock);
@ -430,7 +489,8 @@ static int __uuid_write(struct cache_set *c)
if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
return 1;
SET_KEY_SIZE(&k.key, c->sb.bucket_size);
size = meta_bucket_pages(&c->sb) * PAGE_SECTORS;
SET_KEY_SIZE(&k.key, size);
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
closure_sync(&cl);
@ -518,7 +578,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
bio_set_dev(bio, ca->bdev);
bio->bi_iter.bi_size = bucket_bytes(ca);
bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
bio->bi_end_io = prio_endio;
bio->bi_private = ca;
@ -576,7 +636,7 @@ int bch_prio_write(struct cache *ca, bool wait)
p->next_bucket = ca->prio_buckets[i + 1];
p->magic = pset_magic(&ca->sb);
p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
BUG_ON(bucket == -1);
@ -629,7 +689,7 @@ static int prio_read(struct cache *ca, uint64_t bucket)
prio_io(ca, bucket, REQ_OP_READ, 0);
if (p->csum !=
bch_crc64(&p->magic, bucket_bytes(ca) - 8)) {
bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
pr_warn("bad csum reading priorities\n");
goto out;
}
@ -835,19 +895,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
size_t n;
uint64_t n;
int idx;
if (!d->stripe_size)
d->stripe_size = 1 << 31;
d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
if (!d->nr_stripes || d->nr_stripes > max_stripes) {
pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)\n",
(unsigned int)d->nr_stripes);
n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
if (!n || n > max_stripes) {
pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
n);
return -ENOMEM;
}
d->nr_stripes = n;
n = d->nr_stripes * sizeof(atomic_t);
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
@ -1620,7 +1680,7 @@ static void cache_set_free(struct closure *cl)
}
bch_bset_sort_state_free(&c->sort);
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb)));
if (c->moving_gc_wq)
destroy_workqueue(c->moving_gc_wq);
@ -1783,7 +1843,10 @@ void bch_cache_set_unregister(struct cache_set *c)
}
#define alloc_bucket_pages(gfp, c) \
((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
#define alloc_meta_bucket_pages(gfp, sb) \
((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
@ -1814,12 +1877,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->sb.bucket_size = sb->bucket_size;
c->sb.nr_in_set = sb->nr_in_set;
c->sb.last_mount = sb->last_mount;
c->sb.version = sb->version;
if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
c->sb.feature_compat = sb->feature_compat;
c->sb.feature_ro_compat = sb->feature_ro_compat;
c->sb.feature_incompat = sb->feature_incompat;
}
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
c->nr_uuids = meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
atomic_set(&c->attached_dev_nr, 0);
c->btree_pages = bucket_pages(c);
c->btree_pages = meta_bucket_pages(&c->sb);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
BTREE_MAX_PAGES);
@ -1845,24 +1915,46 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
iter_size = (sb->bucket_size / sb->block_size + 1) *
iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
sizeof(struct btree_iter_set);
if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
mempool_init_kmalloc_pool(&c->bio_meta, 2,
sizeof(struct bbio) + sizeof(struct bio_vec) *
bucket_pages(c)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
!(c->moving_gc_wq = alloc_workqueue("bcache_gc",
WQ_MEM_RECLAIM, 0)) ||
bch_journal_alloc(c) ||
bch_btree_cache_alloc(c) ||
bch_open_buckets_alloc(c) ||
bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
if (!c->devices)
goto err;
if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
goto err;
if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
sizeof(struct bbio) +
sizeof(struct bio_vec) * meta_bucket_pages(&c->sb)))
goto err;
if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
goto err;
if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
goto err;
c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb);
if (!c->uuids)
goto err;
c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
if (!c->moving_gc_wq)
goto err;
if (bch_journal_alloc(c))
goto err;
if (bch_btree_cache_alloc(c))
goto err;
if (bch_open_buckets_alloc(c))
goto err;
if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
goto err;
c->congested_read_threshold_us = 2000;
@ -2107,7 +2199,14 @@ found:
sysfs_create_link(&c->kobj, &ca->kobj, buf))
goto err;
if (ca->sb.seq > c->sb.seq) {
/*
* A special case is both ca->sb.seq and c->sb.seq are 0,
* such condition happens on a new created cache device whose
* super block is never flushed yet. In this case c->sb.version
* and other members should be updated too, otherwise we will
* have a mistaken super block version in cache set.
*/
if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
c->sb.version = ca->sb.version;
memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
c->sb.flags = ca->sb.flags;
@ -2145,7 +2244,7 @@ void bch_cache_release(struct kobject *kobj)
ca->set->cache[ca->sb.nr_this_dev] = NULL;
}
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
kfree(ca->prio_buckets);
vfree(ca->buckets);
@ -2242,7 +2341,7 @@ static int cache_alloc(struct cache *ca)
goto err_prio_buckets_alloc;
}
ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
if (!ca->disk_buckets) {
err = "ca->disk_buckets alloc failed";
goto err_disk_buckets_alloc;
@ -2789,7 +2888,7 @@ static int __init bcache_init(void)
static const struct attribute *files[] = {
&ksysfs_register.attr,
&ksysfs_register_quiet.attr,
#ifdef CONFIG_BCACHE_ASYNC_REGISTRAION
#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
&ksysfs_register_async.attr,
#endif
&ksysfs_pendings_cleanup.attr,

View File

@ -11,6 +11,7 @@
#include "btree.h"
#include "request.h"
#include "writeback.h"
#include "features.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
@ -88,6 +89,9 @@ read_attribute(btree_used_percent);
read_attribute(average_key_size);
read_attribute(dirty_data);
read_attribute(bset_tree_stats);
read_attribute(feature_compat);
read_attribute(feature_ro_compat);
read_attribute(feature_incompat);
read_attribute(state);
read_attribute(cache_read_races);
@ -779,6 +783,13 @@ SHOW(__bch_cache_set)
if (attr == &sysfs_bset_tree_stats)
return bch_bset_print_stats(c, buf);
if (attr == &sysfs_feature_compat)
return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE);
if (attr == &sysfs_feature_ro_compat)
return bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE);
if (attr == &sysfs_feature_incompat)
return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE);
return 0;
}
SHOW_LOCKED(bch_cache_set)
@ -987,6 +998,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_io_disable,
&sysfs_cutoff_writeback,
&sysfs_cutoff_writeback_sync,
&sysfs_feature_compat,
&sysfs_feature_ro_compat,
&sysfs_feature_incompat,
NULL
};
KTYPE(bch_cache_set_internal);

View File

@ -459,10 +459,8 @@ static void read_dirty(struct cached_dev *dc)
for (i = 0; i < nk; i++) {
w = keys[i];
io = kzalloc(sizeof(struct dirty_io) +
sizeof(struct bio_vec) *
DIV_ROUND_UP(KEY_SIZE(&w->key),
PAGE_SECTORS),
io = kzalloc(struct_size(io, bio.bi_inline_vecs,
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
GFP_KERNEL);
if (!io)
goto err;
@ -523,15 +521,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
uint64_t offset, int nr_sectors)
{
struct bcache_device *d = c->devices[inode];
unsigned int stripe_offset, stripe, sectors_dirty;
unsigned int stripe_offset, sectors_dirty;
int stripe;
if (!d)
return;
stripe = offset_to_stripe(d, offset);
if (stripe < 0)
return;
if (UUID_FLASH_ONLY(&c->uuids[inode]))
atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
stripe = offset_to_stripe(d, offset);
stripe_offset = offset & (d->stripe_size - 1);
while (nr_sectors) {
@ -571,12 +573,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
static void refill_full_stripes(struct cached_dev *dc)
{
struct keybuf *buf = &dc->writeback_keys;
unsigned int start_stripe, stripe, next_stripe;
unsigned int start_stripe, next_stripe;
int stripe;
bool wrapped = false;
stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
if (stripe >= dc->disk.nr_stripes)
if (stripe < 0)
stripe = 0;
start_stripe = stripe;
@ -825,10 +827,8 @@ static int bch_dirty_init_thread(void *arg)
struct btree_iter iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
int i;
k = p = NULL;
i = 0;
cur_idx = prev_idx = 0;
bch_btree_iter_init(&c->root->keys, &iter, NULL);

View File

@ -52,10 +52,22 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
static inline unsigned int offset_to_stripe(struct bcache_device *d,
static inline int offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
do_div(offset, d->stripe_size);
/* d->nr_stripes is in range [1, INT_MAX] */
if (unlikely(offset >= d->nr_stripes)) {
pr_err("Invalid stripe %llu (>= nr_stripes %d).\n",
offset, d->nr_stripes);
return -EINVAL;
}
/*
* Here offset is definitly smaller than INT_MAX,
* return it as int will never overflow.
*/
return offset;
}
@ -63,7 +75,10 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
uint64_t offset,
unsigned int nr_sectors)
{
unsigned int stripe = offset_to_stripe(&dc->disk, offset);
int stripe = offset_to_stripe(&dc->disk, offset);
if (stripe < 0)
return false;
while (1) {
if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))

View File

@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
s += blocks;
}
bitmap->last_end_sync = jiffies;
sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
}
EXPORT_SYMBOL(md_bitmap_cond_end_sync);

View File

@ -1518,6 +1518,7 @@ static void unlock_all_bitmaps(struct mddev *mddev)
}
}
kfree(cinfo->other_bitmap_lockres);
cinfo->other_bitmap_lockres = NULL;
}
}

View File

@ -101,6 +101,8 @@ static void mddev_detach(struct mddev *mddev);
* count by 2 for every hour elapsed between read errors.
*/
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
* is 1000 KB/sec, so the extra system load does not show up that much.
@ -463,12 +465,38 @@ check_suspended:
}
EXPORT_SYMBOL(md_handle_request);
struct md_io {
struct mddev *mddev;
bio_end_io_t *orig_bi_end_io;
void *orig_bi_private;
unsigned long start_time;
};
static void md_end_io(struct bio *bio)
{
struct md_io *md_io = bio->bi_private;
struct mddev *mddev = md_io->mddev;
disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time);
bio->bi_end_io = md_io->orig_bi_end_io;
bio->bi_private = md_io->orig_bi_private;
mempool_free(md_io, &mddev->md_io_pool);
if (bio->bi_end_io)
bio->bi_end_io(bio);
}
static blk_qc_t md_submit_bio(struct bio *bio)
{
const int rw = bio_data_dir(bio);
const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = bio->bi_disk->private_data;
unsigned int sectors;
if (mddev == NULL || mddev->pers == NULL) {
bio_io_error(bio);
return BLK_QC_T_NONE;
}
if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
bio_io_error(bio);
@ -477,10 +505,6 @@ static blk_qc_t md_submit_bio(struct bio *bio)
blk_queue_split(&bio);
if (mddev == NULL || mddev->pers == NULL) {
bio_io_error(bio);
return BLK_QC_T_NONE;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
@ -488,21 +512,27 @@ static blk_qc_t md_submit_bio(struct bio *bio)
return BLK_QC_T_NONE;
}
/*
* save the sectors now since our bio can
* go away inside make_request
*/
sectors = bio_sectors(bio);
if (bio->bi_end_io != md_end_io) {
struct md_io *md_io;
md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO);
md_io->mddev = mddev;
md_io->orig_bi_end_io = bio->bi_end_io;
md_io->orig_bi_private = bio->bi_private;
bio->bi_end_io = md_end_io;
bio->bi_private = md_io;
md_io->start_time = disk_start_io_acct(mddev->gendisk,
bio_sectors(bio),
bio_op(bio));
}
/* bio could be mergeable after passing to underlayer */
bio->bi_opf &= ~REQ_NOMERGE;
md_handle_request(mddev, bio);
part_stat_lock();
part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
}
@ -928,7 +958,8 @@ static void super_written(struct bio *bio)
struct mddev *mddev = rdev->mddev;
if (bio->bi_status) {
pr_err("md: super_written gets error=%d\n", bio->bi_status);
pr_err("md: %s gets error=%d\n", __func__,
blk_status_to_errno(bio->bi_status));
md_error(mddev, rdev);
if (!test_bit(Faulty, &rdev->flags)
&& (bio->bi_opf & MD_FAILFAST)) {
@ -2143,6 +2174,24 @@ retry:
sb->sb_csum = calc_sb_1_csum(sb);
}
static sector_t super_1_choose_bm_space(sector_t dev_size)
{
sector_t bm_space;
/* if the device is bigger than 8Gig, save 64k for bitmap
* usage, if bigger than 200Gig, save 128k
*/
if (dev_size < 64*2)
bm_space = 0;
else if (dev_size - 64*2 >= 200*1024*1024*2)
bm_space = 128*2;
else if (dev_size - 4*2 > 8*1024*1024*2)
bm_space = 64*2;
else
bm_space = 4*2;
return bm_space;
}
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
@ -2163,13 +2212,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
return 0;
} else {
/* minor version 0; superblock after data */
sector_t sb_start;
sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
sector_t sb_start, bm_space;
sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
/* 8K is for superblock */
sb_start = dev_size - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
max_sectors = rdev->sectors + sb_start - rdev->sb_start;
bm_space = super_1_choose_bm_space(dev_size);
/* Space that can be used to store date needs to decrease
* superblock bitmap space and bad block space(4K)
*/
max_sectors = sb_start - bm_space - 4*2;
if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
rdev->sb_start = sb_start;
}
sb = page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
@ -2421,9 +2479,13 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
goto fail;
ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
if (sysfs_create_link(&rdev->kobj, ko, "block"))
/* failure here is OK */;
/* failure here is OK */
err = sysfs_create_link(&rdev->kobj, ko, "block");
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
rdev->sysfs_unack_badblocks =
sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
rdev->sysfs_badblocks =
sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
list_add_rcu(&rdev->same_set, &mddev->disks);
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
@ -2457,7 +2519,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
sysfs_put(rdev->sysfs_unack_badblocks);
sysfs_put(rdev->sysfs_badblocks);
rdev->sysfs_state = NULL;
rdev->sysfs_unack_badblocks = NULL;
rdev->sysfs_badblocks = NULL;
rdev->badblocks.count = 0;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
@ -2802,7 +2868,7 @@ rewrite:
goto repeat;
wake_up(&mddev->sb_wait);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
rdev_for_each(rdev, mddev) {
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
@ -3182,8 +3248,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
if (sysfs_link_rdev(rdev->mddev, rdev))
/* failure here is OK */;
/* failure here is OK */;
sysfs_link_rdev(rdev->mddev, rdev);
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
@ -4055,7 +4121,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev_resume(mddev);
if (!mddev->thread)
md_update_sb(mddev, 1);
sysfs_notify(&mddev->kobj, NULL, "level");
sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event(mddev);
rv = len;
out_unlock:
@ -4167,6 +4233,14 @@ out_unlock:
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
static ssize_t
uuid_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%pU\n", mddev->uuid);
}
static struct md_sysfs_entry md_uuid =
__ATTR(uuid, S_IRUGO, uuid_show, NULL);
static ssize_t
chunk_size_show(struct mddev *mddev, char *page)
{
@ -4808,7 +4882,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
}
if (err)
return err;
sysfs_notify(&mddev->kobj, NULL, "degraded");
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
} else {
if (cmd_match(page, "check"))
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@ -5423,6 +5497,7 @@ static struct attribute *md_default_attrs[] = {
&md_level.attr,
&md_layout.attr,
&md_raid_disks.attr,
&md_uuid.attr,
&md_chunk_size.attr,
&md_size.attr,
&md_resync_start.attr,
@ -5514,6 +5589,13 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_state)
sysfs_put(mddev->sysfs_state);
if (mddev->sysfs_completed)
sysfs_put(mddev->sysfs_completed);
if (mddev->sysfs_degraded)
sysfs_put(mddev->sysfs_degraded);
if (mddev->sysfs_level)
sysfs_put(mddev->sysfs_level);
if (mddev->gendisk)
del_gendisk(mddev->gendisk);
@ -5525,6 +5607,7 @@ static void md_free(struct kobject *ko)
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
mempool_exit(&mddev->md_io_pool);
kfree(mddev);
}
@ -5620,6 +5703,11 @@ static int md_alloc(dev_t dev, char *name)
*/
mddev->hold_active = UNTIL_STOP;
error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
sizeof(struct md_io));
if (error)
goto abort;
error = -ENOMEM;
mddev->queue = blk_alloc_queue(NUMA_NO_NODE);
if (!mddev->queue)
@ -5676,6 +5764,9 @@ static int md_alloc(dev_t dev, char *name)
if (!error && mddev->kobj.sd) {
kobject_uevent(&mddev->kobj, KOBJ_ADD);
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
}
mddev_put(mddev);
return error;
@ -5961,7 +6052,7 @@ int md_run(struct mddev *mddev)
if (mddev_is_clustered(mddev))
mddev->safemode_delay = 0;
else
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
mddev->in_sync = 1;
smp_wmb();
spin_lock(&mddev->lock);
@ -6028,7 +6119,7 @@ static int do_md_run(struct mddev *mddev)
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
sysfs_notify_dirent_safe(mddev->sysfs_state);
sysfs_notify_dirent_safe(mddev->sysfs_action);
sysfs_notify(&mddev->kobj, NULL, "degraded");
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
out:
clear_bit(MD_NOT_READY, &mddev->flags);
return err;
@ -7339,6 +7430,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.nodes = 0;
md_cluster_ops->leave(mddev);
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
mddev_suspend(mddev);
md_bitmap_destroy(mddev);
@ -8330,6 +8423,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
int md_setup_cluster(struct mddev *mddev, int nodes)
{
int ret;
if (!md_cluster_ops)
request_module("md-cluster");
spin_lock(&pers_lock);
@ -8341,7 +8435,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
}
spin_unlock(&pers_lock);
return md_cluster_ops->join(mddev, nodes);
ret = md_cluster_ops->join(mddev, nodes);
if (!ret)
mddev->safemode_delay = 0;
return ret;
}
void md_cluster_stop(struct mddev *mddev)
@ -8742,7 +8839,7 @@ void md_do_sync(struct md_thread *thread)
} else
mddev->curr_resync = 3; /* no longer delayed */
mddev->curr_resync_completed = j;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
md_new_event(mddev);
update_time = jiffies;
@ -8770,7 +8867,7 @@ void md_do_sync(struct md_thread *thread)
mddev->recovery_cp = j;
update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
while (j >= mddev->resync_max &&
@ -8877,7 +8974,7 @@ void md_do_sync(struct md_thread *thread)
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->curr_resync > 3) {
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
@ -9007,7 +9104,7 @@ static int remove_and_add_spares(struct mddev *mddev,
}
if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded");
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
if (this && removed)
goto no_add;
@ -9035,8 +9132,8 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->recovery_offset = 0;
}
if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
/* failure here is OK */
sysfs_link_rdev(mddev, rdev);
if (!test_bit(Journal, &rdev->flags))
spares++;
md_new_event(mddev);
@ -9290,8 +9387,7 @@ void md_reap_sync_thread(struct mddev *mddev)
/* success...*/
/* activate any spares */
if (mddev->pers->spare_active(mddev)) {
sysfs_notify(&mddev->kobj, NULL,
"degraded");
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
}
}
@ -9381,8 +9477,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
if (rv == 0) {
/* Make sure they get written out promptly */
if (test_bit(ExternalBbl, &rdev->flags))
sysfs_notify(&rdev->kobj, NULL,
"unacknowledged_bad_blocks");
sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
@ -9403,7 +9498,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
s += rdev->data_offset;
rv = badblocks_clear(&rdev->badblocks, s, sectors);
if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
@ -9633,7 +9728,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->recovery_offset == MaxSector &&
!test_bit(In_sync, &rdev->flags) &&
mddev->pers->spare_active(mddev))
sysfs_notify(&mddev->kobj, NULL, "degraded");
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
put_page(swapout);
return 0;

View File

@ -126,7 +126,10 @@ struct md_rdev {
struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
/* handle for 'unacknowledged_bad_blocks' sysfs dentry */
struct kernfs_node *sysfs_unack_badblocks;
/* handle for 'bad_blocks' sysfs dentry */
struct kernfs_node *sysfs_badblocks;
struct badblocks badblocks;
struct {
@ -420,6 +423,9 @@ struct mddev {
* file in sysfs.
*/
struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */
struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
struct kernfs_node *sysfs_level; /*handle for 'level' */
struct work_struct del_work; /* used for delayed sysfs removal */
@ -481,6 +487,7 @@ struct mddev {
struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
mempool_t md_io_pool;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit

View File

@ -955,6 +955,7 @@ static void wait_barrier(struct r10conf *conf)
{
spin_lock_irq(&conf->resync_lock);
if (conf->barrier) {
struct bio_list *bio_list = current->bio_list;
conf->nr_waiting++;
/* Wait for the barrier to drop.
* However if there are already pending
@ -969,9 +970,16 @@ static void wait_barrier(struct r10conf *conf)
wait_event_lock_irq(conf->wait_barrier,
!conf->barrier ||
(atomic_read(&conf->nr_pending) &&
current->bio_list &&
(!bio_list_empty(&current->bio_list[0]) ||
!bio_list_empty(&current->bio_list[1]))),
bio_list &&
(!bio_list_empty(&bio_list[0]) ||
!bio_list_empty(&bio_list[1]))) ||
/* move on if recovery thread is
* blocked by us
*/
(conf->mddev->thread->tsk == current &&
test_bit(MD_RECOVERY_RUNNING,
&conf->mddev->recovery) &&
conf->nr_queued > 0),
conf->resync_lock);
conf->nr_waiting--;
if (!conf->nr_waiting)
@ -4282,8 +4290,8 @@ out:
else
rdev->recovery_offset = 0;
if (sysfs_link_rdev(mddev, rdev))
/* Failure here is OK */;
/* Failure here is OK */
sysfs_link_rdev(mddev, rdev);
}
} else if (rdev->raid_disk >= conf->prev.raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
@ -4429,7 +4437,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
sector_nr = conf->reshape_progress;
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
return sector_nr;
}

View File

@ -195,9 +195,7 @@ struct r5l_log {
static inline sector_t r5c_tree_index(struct r5conf *conf,
sector_t sect)
{
sector_t offset;
offset = sector_div(sect, conf->chunk_sectors);
sector_div(sect, conf->chunk_sectors);
return sect;
}
@ -298,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
wbi2 = r5_next_bio(conf, wbi, dev->sector);
md_write_end(conf->mddev);
bio_endio(wbi);
wbi = wbi2;
@ -316,7 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
}
@ -364,7 +362,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
*/
if (atomic_read(&conf->r5c_cached_full_stripes) >=
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
conf->chunk_sectors >> STRIPE_SHIFT))
conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
r5l_wake_reclaim(conf->log, 0);
}
@ -2430,10 +2428,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
struct stripe_head *sh, *next;
bool cleared_pending = false;
if (ctx->data_only_stripes == 0)
return;
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
cleared_pending = true;
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
}
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
@ -2448,6 +2451,8 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
atomic_read(&conf->active_stripes) == 0);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
if (cleared_pending)
set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
}
static int r5l_recovery_log(struct r5l_log *log)
@ -2532,13 +2537,10 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
struct r5conf *conf;
int ret;
ret = mddev_lock(mddev);
if (ret)
return ret;
spin_lock(&mddev->lock);
conf = mddev->private;
if (!conf || !conf->log) {
mddev_unlock(mddev);
spin_unlock(&mddev->lock);
return 0;
}
@ -2558,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
default:
ret = 0;
}
mddev_unlock(mddev);
spin_unlock(&mddev->lock);
return ret;
}

View File

@ -324,7 +324,7 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
* be just after the last logged stripe and write to the same
* disks. Use bit shift and logarithm to avoid 64-bit division.
*/
if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) &&
(data_sector >> ilog2(conf->chunk_sectors) ==
data_sector_last >> ilog2(conf->chunk_sectors)) &&
((data_sector - data_sector_last) * data_disks ==
@ -844,9 +844,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
/* if start and end is 4k aligned, use a 4k block */
if (block_size == 512 &&
(r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
(r_sector_last & (STRIPE_SECTORS - 1)) == 0)
block_size = STRIPE_SIZE;
(r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 &&
(r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0)
block_size = RAID5_STRIPE_SIZE(conf);
/* iterate through blocks in strip */
for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
@ -1274,7 +1274,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
if (ppl_data_sectors > 0)
ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
ppl_data_sectors = rounddown(ppl_data_sectors,
RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private));
if (ppl_data_sectors <= 0) {
pr_warn("md/raid:%s: PPL space too small on %s\n",

View File

@ -69,13 +69,13 @@ static struct workqueue_struct *raid5_wq;
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
return &conf->stripe_hashtbl[hash];
}
static inline int stripe_hash_locks_hash(sector_t sect)
static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
{
return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
@ -627,7 +627,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(sector);
int hash = stripe_hash_locks_hash(conf, sector);
int inc_empty_inactive_list_flag;
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
@ -748,9 +748,9 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
tmp_sec = sh->sector;
if (!sector_div(tmp_sec, conf->chunk_sectors))
return;
head_sector = sh->sector - STRIPE_SECTORS;
head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
hash = stripe_hash_locks_hash(head_sector);
hash = stripe_hash_locks_hash(conf, head_sector);
spin_lock_irq(conf->hash_locks + hash);
head = __find_stripe(conf, head_sector, conf->generation);
if (head && !atomic_inc_not_zero(&head->count)) {
@ -1057,7 +1057,7 @@ again:
test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
int bad_sectors;
int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (!bad)
break;
@ -1089,7 +1089,7 @@ again:
if (rdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
@ -1129,9 +1129,9 @@ again:
else
sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_iter.bi_size = STRIPE_SIZE;
bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
bi->bi_write_hint = sh->dev[i].write_hint;
if (!rrdev)
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
@ -1156,7 +1156,7 @@ again:
if (rrdev) {
if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
set_bit(STRIPE_IO_STARTED, &sh->state);
@ -1183,9 +1183,9 @@ again:
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
sh->dev[i].rvec.bv_page = sh->dev[i].page;
rbi->bi_vcnt = 1;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_iter.bi_size = STRIPE_SIZE;
rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
rbi->bi_write_hint = sh->dev[i].write_hint;
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
/*
@ -1235,6 +1235,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
int page_offset;
struct async_submit_ctl submit;
enum async_tx_flags flags = 0;
struct r5conf *conf = sh->raid_conf;
if (bio->bi_iter.bi_sector >= sector)
page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
@ -1256,8 +1257,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
len -= b_offset;
}
if (len > 0 && page_offset + len > STRIPE_SIZE)
clen = STRIPE_SIZE - page_offset;
if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
clen = RAID5_STRIPE_SIZE(conf) - page_offset;
else
clen = len;
@ -1265,9 +1266,9 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
b_offset += bvl.bv_offset;
bio_page = bvl.bv_page;
if (frombio) {
if (sh->raid_conf->skip_copy &&
if (conf->skip_copy &&
b_offset == 0 && page_offset == 0 &&
clen == STRIPE_SIZE &&
clen == RAID5_STRIPE_SIZE(conf) &&
!no_skipcopy)
*page = bio_page;
else
@ -1292,6 +1293,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
int i;
struct r5conf *conf = sh->raid_conf;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
@ -1312,8 +1314,8 @@ static void ops_complete_biofill(void *stripe_head_ref)
rbi = dev->read;
dev->read = NULL;
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
rbi2 = r5_next_bio(conf, rbi, dev->sector);
bio_endio(rbi);
rbi = rbi2;
}
@ -1330,6 +1332,7 @@ static void ops_run_biofill(struct stripe_head *sh)
struct dma_async_tx_descriptor *tx = NULL;
struct async_submit_ctl submit;
int i;
struct r5conf *conf = sh->raid_conf;
BUG_ON(sh->batch_head);
pr_debug("%s: stripe %llu\n", __func__,
@ -1344,10 +1347,10 @@ static void ops_run_biofill(struct stripe_head *sh)
dev->toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
tx = async_copy_data(0, rbi, &dev->page,
dev->sector, tx, sh, 0);
rbi = r5_next_bio(rbi, dev->sector);
rbi = r5_next_bio(conf, rbi, dev->sector);
}
}
}
@ -1429,9 +1432,11 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
tx = async_xor(xor_dest, xor_srcs, 0, count,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@ -1522,7 +1527,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
tx = async_gen_syndrome(blocks, 0, count+2,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
} else {
/* Compute any data- or p-drive using XOR */
count = 0;
@ -1535,7 +1541,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
tx = async_xor(dest, blocks, 0, count,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
}
return tx;
@ -1598,7 +1605,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, syndrome_disks+2,
STRIPE_SIZE, &submit);
RAID5_STRIPE_SIZE(sh->raid_conf),
&submit);
} else {
struct page *dest;
int data_target;
@ -1621,7 +1629,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
tx = async_xor(dest, blocks, 0, count,
RAID5_STRIPE_SIZE(sh->raid_conf),
&submit);
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
@ -1629,7 +1638,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
ops_complete_compute, sh,
to_addr_conv(sh, percpu, 0));
return async_gen_syndrome(blocks, 0, count+2,
STRIPE_SIZE, &submit);
RAID5_STRIPE_SIZE(sh->raid_conf),
&submit);
}
} else {
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@ -1638,13 +1648,15 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
if (failb == syndrome_disks) {
/* We're missing D+P. */
return async_raid6_datap_recov(syndrome_disks+2,
STRIPE_SIZE, faila,
blocks, &submit);
RAID5_STRIPE_SIZE(sh->raid_conf),
faila,
blocks, &submit);
} else {
/* We're missing D+D. */
return async_raid6_2data_recov(syndrome_disks+2,
STRIPE_SIZE, faila, failb,
blocks, &submit);
RAID5_STRIPE_SIZE(sh->raid_conf),
faila, failb,
blocks, &submit);
}
}
}
@ -1691,7 +1703,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
tx = async_xor(xor_dest, xor_srcs, 0, count,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@ -1711,7 +1724,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
tx = async_gen_syndrome(blocks, 0, count+2,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
return tx;
}
@ -1752,7 +1766,7 @@ again:
WARN_ON(dev->page != dev->orig_page);
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
if (wbi->bi_opf & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_opf & REQ_SYNC)
@ -1770,7 +1784,7 @@ again:
clear_bit(R5_OVERWRITE, &dev->flags);
}
}
wbi = r5_next_bio(wbi, dev->sector);
wbi = r5_next_bio(conf, wbi, dev->sector);
}
if (head_sh->batch_head) {
@ -1910,9 +1924,11 @@ again:
}
if (unlikely(count == 1))
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
else
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
tx = async_xor(xor_dest, xor_srcs, 0, count,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
@ -1972,7 +1988,8 @@ again:
} else
init_async_submit(&submit, 0, tx, NULL, NULL,
to_addr_conv(sh, percpu, j));
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
tx = async_gen_syndrome(blocks, 0, count+2,
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
if (!last_stripe) {
j++;
sh = list_first_entry(&sh->batch_list, struct stripe_head,
@ -2020,7 +2037,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
init_async_submit(&submit, 0, NULL, NULL, NULL,
to_addr_conv(sh, percpu, 0));
tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
tx = async_xor_val(xor_dest, xor_srcs, 0, count,
RAID5_STRIPE_SIZE(sh->raid_conf),
&sh->ops.zero_sum_result, &submit);
atomic_inc(&sh->count);
@ -2045,7 +2063,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
atomic_inc(&sh->count);
init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
sh, to_addr_conv(sh, percpu, 0));
async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
async_syndrome_val(srcs, 0, count+2,
RAID5_STRIPE_SIZE(sh->raid_conf),
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
@ -2217,9 +2236,9 @@ static int grow_stripes(struct r5conf *conf, int num)
/**
* scribble_alloc - allocate percpu scribble buffer for required size
* of the scribble region
* @percpu - from for_each_present_cpu() of the caller
* @num - total number of disks in the array
* @cnt - scribble objs count for required size of the scribble region
* @percpu: from for_each_present_cpu() of the caller
* @num: total number of disks in the array
* @cnt: scribble objs count for required size of the scribble region
*
* The scribble buffer size must be enough to contain:
* 1/ a struct page pointer for each device in the array +2
@ -2275,7 +2294,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
percpu = per_cpu_ptr(conf->percpu, cpu);
err = scribble_alloc(percpu, new_disks,
new_sectors / STRIPE_SECTORS);
new_sectors / RAID5_STRIPE_SECTORS(conf));
if (err)
break;
}
@ -2509,10 +2528,10 @@ static void raid5_end_read_request(struct bio * bi)
*/
pr_info_ratelimited(
"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
mdname(conf->mddev), STRIPE_SECTORS,
mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
(unsigned long long)s,
bdevname(rdev->bdev, b));
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
clear_bit(R5_ReWrite, &sh->dev[i].flags);
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
@ -2585,7 +2604,7 @@ static void raid5_end_read_request(struct bio * bi)
if (!(set_bad
&& test_bit(In_sync, &rdev->flags)
&& rdev_set_badblocks(
rdev, sh->sector, STRIPE_SECTORS, 0)))
rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
md_error(conf->mddev, rdev);
}
}
@ -2637,7 +2656,7 @@ static void raid5_end_write_request(struct bio *bi)
if (bi->bi_status)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
@ -2649,7 +2668,7 @@ static void raid5_end_write_request(struct bio *bi)
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
@ -3283,13 +3302,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
/* check if page is covered */
sector_t sector = sh->dev[dd_idx].sector;
for (bi=sh->dev[dd_idx].towrite;
sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
bi && bi->bi_iter.bi_sector <= sector;
bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
if (bio_end_sector(bi) >= sector)
sector = bio_end_sector(bi);
}
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
sh->overwrite_disks++;
}
@ -3314,7 +3333,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0);
RAID5_STRIPE_SECTORS(conf), 0);
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
@ -3376,7 +3395,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (!rdev_set_badblocks(
rdev,
sh->sector,
STRIPE_SECTORS, 0))
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
@ -3396,8 +3415,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
wake_up(&conf->wait_for_overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
@ -3405,7 +3424,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0, 0);
RAID5_STRIPE_SECTORS(conf), 0, 0);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@ -3417,8 +3436,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
md_write_end(conf->mddev);
bio_io_error(bi);
@ -3441,9 +3460,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
r5_next_bio(conf, bi, sh->dev[i].sector);
bio_io_error(bi);
bi = nextbi;
@ -3451,7 +3470,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
if (bitmap_end)
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0, 0);
RAID5_STRIPE_SECTORS(conf), 0, 0);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@ -3496,14 +3515,14 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0))
RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0))
RAID5_STRIPE_SECTORS(conf), 0))
abort = 1;
}
rcu_read_unlock();
@ -3511,7 +3530,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
conf->recovery_disabled =
conf->mddev->recovery_disabled;
}
md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
}
static int want_replace(struct stripe_head *sh, int disk_idx)
@ -3538,6 +3557,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
&sh->dev[s->failed_num[1]] };
int i;
bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
if (test_bit(R5_LOCKED, &dev->flags) ||
@ -3596,17 +3616,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
* devices must be read.
*/
return 1;
if (s->failed >= 2 &&
(fdev[i]->towrite ||
s->failed_num[i] == sh->pd_idx ||
s->failed_num[i] == sh->qd_idx) &&
!test_bit(R5_UPTODATE, &fdev[i]->flags))
/* In max degraded raid6, If the failed disk is P, Q,
* or we want to read the failed disk, we need to do
* reconstruct-write.
*/
force_rcw = true;
}
/* If we are forced to do a reconstruct-write, either because
* the current RAID6 implementation only supports that, or
* because parity cannot be trusted and we are currently
* recovering it, there is extra need to be careful.
/* If we are forced to do a reconstruct-write, because parity
* cannot be trusted and we are currently recovering it, there
* is extra need to be careful.
* If one of the devices that we would need to read, because
* it is not being overwritten (and maybe not written at all)
* is missing/faulty, then we need to read everything we can.
*/
if (sh->raid_conf->level != 6 &&
if (!force_rcw &&
sh->sector < sh->raid_conf->mddev->recovery_cp)
/* reconstruct-write isn't being forced */
return 0;
@ -3710,7 +3740,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
return 0;
}
/**
/*
* handle_stripe_fill - read or compute data to satisfy pending requests.
*/
static void handle_stripe_fill(struct stripe_head *sh,
@ -3785,14 +3815,14 @@ returnbi:
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
wbi2 = r5_next_bio(conf, wbi, dev->sector);
md_write_end(conf->mddev);
bio_endio(wbi);
wbi = wbi2;
}
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
RAID5_STRIPE_SECTORS(conf),
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
if (head_sh->batch_head) {
@ -3976,10 +4006,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
} else {
} else
set_bit(STRIPE_DELAYED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
}
@ -4004,10 +4032,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(R5_Wantread, &dev->flags);
s->locked++;
qread++;
} else {
} else
set_bit(STRIPE_DELAYED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
if (rcw && conf->mddev->queue)
@ -4099,7 +4125,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@ -4107,7 +4133,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
STRIPE_SECTORS);
RAID5_STRIPE_SECTORS(conf));
} else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
@ -4264,7 +4290,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
*/
}
} else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
@ -4272,7 +4298,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
"%llu-%llu\n", mdname(conf->mddev),
(unsigned long long) sh->sector,
(unsigned long long) sh->sector +
STRIPE_SECTORS);
RAID5_STRIPE_SECTORS(conf));
} else {
int *target = &sh->ops.target;
@ -4343,7 +4369,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
/* place all the copies on one channel */
init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
tx = async_memcpy(sh2->dev[dd_idx].page,
sh->dev[i].page, 0, 0, STRIPE_SIZE,
sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf),
&submit);
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
@ -4442,8 +4468,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
*/
rdev = rcu_dereference(conf->disks[i].replacement);
if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
!is_badblock(rdev, sh->sector, STRIPE_SECTORS,
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags);
else {
@ -4457,7 +4483,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
@ -4484,7 +4510,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
}
} else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
/* in sync if before recovery_offset */
set_bit(R5_Insync, &dev->flags);
else if (test_bit(R5_UPTODATE, &dev->flags) &&
@ -4573,12 +4599,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rcu_read_unlock();
}
/*
* Return '1' if this is a member of batch, or '0' if it is a lone stripe or
* a head which can now be handled.
*/
static int clear_batch_ready(struct stripe_head *sh)
{
/* Return '1' if this is a member of batch, or
* '0' if it is a lone stripe or a head which can now be
* handled.
*/
struct stripe_head *tmp;
if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
return (sh->batch_head && sh->batch_head != sh);
@ -4682,6 +4708,16 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *pdev, *qdev;
clear_bit(STRIPE_HANDLE, &sh->state);
/*
* handle_stripe should not continue handle the batched stripe, only
* the head of batch list or lone stripe can continue. Otherwise we
* could see break_stripe_batch_list warns about the STRIPE_ACTIVE
* is set for the batched stripe.
*/
if (clear_batch_ready(sh))
return;
if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
/* already being handled, ensure it gets handled
* again when current action finishes */
@ -4689,11 +4725,6 @@ static void handle_stripe(struct stripe_head *sh)
return;
}
if (clear_batch_ready(sh) ) {
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
return;
}
if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
break_stripe_batch_list(sh, 0);
@ -4842,7 +4873,7 @@ static void handle_stripe(struct stripe_head *sh)
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite
|| (conf->level == 6 && s.to_write && s.failed)
|| (s.to_write && s.failed)
|| (s.syncing && (s.uptodate + s.compute < disks))
|| s.replacing
|| s.expanding)
@ -4927,7 +4958,7 @@ static void handle_stripe(struct stripe_head *sh)
if ((s.syncing || s.replacing) && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
wake_up(&conf->wait_for_overlap);
@ -4946,14 +4977,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
} else {
} else
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
}
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
}
}
@ -4995,7 +5023,7 @@ static void handle_stripe(struct stripe_head *sh)
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
if (s.expanding && s.locked == 0 &&
@ -5025,14 +5053,14 @@ finish:
/* We own a safe reference to the rdev */
rdev = conf->disks[i].rdev;
if (!rdev_set_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0))
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0);
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@ -5041,7 +5069,7 @@ finish:
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0);
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
}
@ -5483,7 +5511,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
/* Skip discard while reshape is happening */
return;
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
@ -5498,7 +5526,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
last_sector *= conf->chunk_sectors;
for (; logical_sector < last_sector;
logical_sector += STRIPE_SECTORS) {
logical_sector += RAID5_STRIPE_SECTORS(conf)) {
DEFINE_WAIT(w);
int d;
again:
@ -5543,7 +5571,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
d++)
md_bitmap_startwrite(mddev->bitmap,
sh->sector,
STRIPE_SECTORS,
RAID5_STRIPE_SECTORS(conf),
0);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
@ -5608,12 +5636,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
return true;
}
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
int previous;
int seq;
@ -5711,8 +5739,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
do_flush = false;
}
if (!sh->batch_head || sh == sh->batch_head)
set_bit(STRIPE_HANDLE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
if ((!sh->batch_head || sh == sh->batch_head) &&
(bi->bi_opf & REQ_SYNC) &&
@ -5777,7 +5804,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_div(sector_nr, new_data_disks);
if (sector_nr) {
mddev->curr_resync_completed = sector_nr;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
*skipped = 1;
retn = sector_nr;
goto finish;
@ -5891,11 +5918,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
INIT_LIST_HEAD(&stripes);
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
int j;
int skipped_disk = 0;
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
@ -5916,7 +5943,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
skipped_disk = 1;
continue;
}
memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
set_bit(R5_Expanded, &sh->dev[j].flags);
set_bit(R5_UPTODATE, &sh->dev[j].flags);
}
@ -5951,7 +5978,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
first_sector += STRIPE_SECTORS;
first_sector += RAID5_STRIPE_SECTORS(conf);
}
/* Now that the sources are clearly marked, we can release
* the destination stripes
@ -5998,7 +6025,7 @@ finish:
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
return retn;
@ -6057,11 +6084,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
sync_blocks >= STRIPE_SECTORS) {
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
sync_blocks /= STRIPE_SECTORS;
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
*skipped = 1;
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
/* keep things rounded to whole stripes */
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
@ -6094,7 +6122,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
raid5_release_stripe(sh);
return STRIPE_SECTORS;
return RAID5_STRIPE_SECTORS(conf);
}
static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
@ -6117,14 +6145,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
int handled = 0;
logical_sector = raid_bio->bi_iter.bi_sector &
~((sector_t)STRIPE_SECTORS-1);
~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
last_sector = bio_end_sector(raid_bio);
for (; logical_sector < last_sector;
logical_sector += STRIPE_SECTORS,
sector += STRIPE_SECTORS,
logical_sector += RAID5_STRIPE_SECTORS(conf),
sector += RAID5_STRIPE_SECTORS(conf),
scnt++) {
if (scnt < offset)
@ -6457,6 +6485,77 @@ raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
raid5_show_rmw_level,
raid5_store_rmw_level);
static ssize_t
raid5_show_stripe_size(struct mddev *mddev, char *page)
{
struct r5conf *conf;
int ret = 0;
spin_lock(&mddev->lock);
conf = mddev->private;
if (conf)
ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
spin_unlock(&mddev->lock);
return ret;
}
#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
static ssize_t
raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
{
struct r5conf *conf;
unsigned long new;
int err;
if (len >= PAGE_SIZE)
return -EINVAL;
if (kstrtoul(page, 10, &new))
return -EINVAL;
/*
* The value should not be bigger than PAGE_SIZE. It requires to
* be multiple of DEFAULT_STRIPE_SIZE.
*/
if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0)
return -EINVAL;
err = mddev_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf) {
err = -ENODEV;
goto out_unlock;
}
if (new == conf->stripe_size)
goto out_unlock;
pr_debug("md/raid: change stripe_size from %lu to %lu\n",
conf->stripe_size, new);
mddev_suspend(mddev);
conf->stripe_size = new;
conf->stripe_shift = ilog2(new) - 9;
conf->stripe_sectors = new >> 9;
mddev_resume(mddev);
out_unlock:
mddev_unlock(mddev);
return err ?: len;
}
static struct md_sysfs_entry
raid5_stripe_size = __ATTR(stripe_size, 0644,
raid5_show_stripe_size,
raid5_store_stripe_size);
#else
static struct md_sysfs_entry
raid5_stripe_size = __ATTR(stripe_size, 0444,
raid5_show_stripe_size,
NULL);
#endif
static ssize_t
raid5_show_preread_threshold(struct mddev *mddev, char *page)
@ -6645,6 +6744,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_group_thread_cnt.attr,
&raid5_skip_copy.attr,
&raid5_rmw_level.attr,
&raid5_stripe_size.attr,
&r5c_journal_mode.attr,
&ppl_write_hint.attr,
NULL,
@ -6744,7 +6844,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
conf->previous_raid_disks),
max(conf->chunk_sectors,
conf->prev_chunk_sectors)
/ STRIPE_SECTORS)) {
/ RAID5_STRIPE_SECTORS(conf))) {
free_scratch_buffer(conf, percpu);
return -ENOMEM;
}
@ -6896,6 +6996,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
if (conf == NULL)
goto abort;
#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
conf->stripe_size = DEFAULT_STRIPE_SIZE;
conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
#endif
INIT_LIST_HEAD(&conf->free_list);
INIT_LIST_HEAD(&conf->pending_list);
conf->pending_data = kcalloc(PENDING_IO_MAX,
@ -7047,8 +7153,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->min_nr_stripes = NR_STRIPES;
if (mddev->reshape_position != MaxSector) {
int stripes = max_t(int,
((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
conf->min_nr_stripes = max(NR_STRIPES, stripes);
if (conf->min_nr_stripes != NR_STRIPES)
pr_info("md/raid:%s: force stripe size %d for reshape\n",
@ -7779,14 +7885,14 @@ static int check_stripe_cache(struct mddev *mddev)
* stripe_heads first.
*/
struct r5conf *conf = mddev->private;
if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes ||
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
> conf->min_nr_stripes) {
pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
mdname(mddev),
((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
/ STRIPE_SIZE)*4);
/ RAID5_STRIPE_SIZE(conf))*4);
return 0;
}
return 1;
@ -7922,8 +8028,8 @@ static int raid5_start_reshape(struct mddev *mddev)
else
rdev->recovery_offset = 0;
if (sysfs_link_rdev(mddev, rdev))
/* Failure here is OK */;
/* Failure here is OK */
sysfs_link_rdev(mddev, rdev);
}
} else if (rdev->raid_disk >= conf->previous_raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
@ -8118,7 +8224,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
while (chunksect && (mddev->array_sectors & (chunksect-1)))
chunksect >>= 1;
if ((chunksect<<9) < STRIPE_SIZE)
if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
/* array size does not allow a suitable chunk size */
return ERR_PTR(-EINVAL);

View File

@ -472,32 +472,20 @@ struct disk_info {
*/
#define NR_STRIPES 256
#define DEFAULT_STRIPE_SIZE 4096
#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define STRIPE_SIZE PAGE_SIZE
#define STRIPE_SHIFT (PAGE_SHIFT - 9)
#define STRIPE_SECTORS (STRIPE_SIZE>>9)
#endif
#define IO_THRESHOLD 1
#define BYPASS_THRESHOLD 1
#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK (NR_HASH - 1)
#define MAX_STRIPE_BATCH 8
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
* order without overlap. There may be several bio's per stripe+device, and
* a bio could span several devices.
* When walking this list for a particular stripe+device, we must never proceed
* beyond a bio that extends past this device, as the next bio might no longer
* be valid.
* This function is used to determine the 'next' bio in the list, given the
* sector of the current stripe+device
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
}
/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause
@ -574,6 +562,11 @@ struct r5conf {
int raid_disks;
int max_nr_stripes;
int min_nr_stripes;
#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
unsigned long stripe_size;
unsigned int stripe_shift;
unsigned long stripe_sectors;
#endif
/* reshape_progress is the leading edge of a 'reshape'
* It has value MaxSector when no reshape is happening
@ -690,6 +683,32 @@ struct r5conf {
struct r5pending_data *next_pending_data;
};
#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE
#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT
#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS
#else
#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size)
#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift)
#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors)
#endif
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
* order without overlap. There may be several bio's per stripe+device, and
* a bio could span several devices.
* When walking this list for a particular stripe+device, we must never proceed
* beyond a bio that extends past this device, as the next bio might no longer
* be valid.
* This function is used to determine the 'next' bio in the list, given the
* sector of the current stripe+device
*/
static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector)
{
if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
return bio->bi_next;
else
return NULL;
}
/*
* Our supported algorithms

View File

@ -13,6 +13,7 @@ nvme-core-y := core.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o

View File

@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;
static int nvme_revalidate_disk(struct gendisk *disk);
static int _nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
* Revalidating a dead namespace sets capacity to 0. This will end
* buffered writers dirtying pages that can't be synced.
*/
if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
nvme_retry_req(req);
return;
}
} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
req_op(req) == REQ_OP_ZONE_APPEND) {
req->__sector = nvme_lba_to_sect(req->q->queuedata,
le64_to_cpu(nvme_req(req)->result.u64));
}
nvme_trace_bio_complete(req, status);
@ -362,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
break;
}
break;
case NVME_CTRL_DELETING_NOIO:
switch (old_state) {
case NVME_CTRL_DELETING:
case NVME_CTRL_DEAD:
changed = true;
/* FALLTHRU */
default:
break;
}
break;
case NVME_CTRL_DEAD:
switch (old_state) {
case NVME_CTRL_DELETING:
@ -399,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
case NVME_CTRL_CONNECTING:
return false;
case NVME_CTRL_DELETING:
case NVME_CTRL_DELETING_NOIO:
case NVME_CTRL_DEAD:
return true;
default:
@ -450,10 +465,11 @@ static void nvme_free_ns(struct kref *kref)
kfree(ns);
}
static void nvme_put_ns(struct nvme_ns *ns)
void nvme_put_ns(struct nvme_ns *ns)
{
kref_put(&ns->kref, nvme_free_ns);
}
EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
static inline void nvme_clear_nvme_request(struct request *req)
{
@ -555,7 +571,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
goto out_disable_stream;
}
ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
return 0;
@ -589,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
}
static void nvme_setup_passthrough(struct request *req,
struct nvme_command *cmd)
{
memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
/* passthru commands should let the driver set the SGL flags */
cmd->common.flags &= ~NVME_CMD_SGL_ALL;
}
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
@ -673,7 +697,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
}
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
struct request *req, struct nvme_command *cmnd,
enum nvme_opcode op)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
@ -687,7 +712,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.opcode = op;
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@ -716,6 +741,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
if (op == nvme_cmd_zone_append)
control |= NVME_RW_APPEND_PIREMAP;
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
}
@ -751,11 +778,24 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
nvme_setup_passthrough(req, cmd);
break;
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
case REQ_OP_ZONE_RESET_ALL:
case REQ_OP_ZONE_RESET:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
break;
case REQ_OP_ZONE_OPEN:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
break;
case REQ_OP_ZONE_CLOSE:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
break;
case REQ_OP_ZONE_FINISH:
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
break;
case REQ_OP_WRITE_ZEROES:
ret = nvme_setup_write_zeroes(ns, req, cmd);
break;
@ -763,8 +803,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
ret = nvme_setup_discard(ns, req, cmd);
break;
case REQ_OP_READ:
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
break;
case REQ_OP_WRITE:
ret = nvme_setup_rw(ns, req, cmd);
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
break;
case REQ_OP_ZONE_APPEND:
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
break;
default:
WARN_ON_ONCE(1);
@ -884,6 +929,120 @@ out:
return ERR_PTR(ret);
}
static u32 nvme_known_admin_effects(u8 opcode)
{
switch (opcode) {
case nvme_admin_format_nvm:
return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
NVME_CMD_EFFECTS_CSE_MASK;
case nvme_admin_sanitize_nvm:
return NVME_CMD_EFFECTS_CSE_MASK;
default:
break;
}
return 0;
}
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
{
u32 effects = 0;
if (ns) {
if (ns->head->effects)
effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
dev_warn(ctrl->device,
"IO command:%02x has unhandled effects:%08x\n",
opcode, effects);
return 0;
}
if (ctrl->effects)
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
effects |= nvme_known_admin_effects(opcode);
return effects;
}
EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode)
{
u32 effects = nvme_command_effects(ctrl, ns, opcode);
/*
* For simplicity, IO to all namespaces is quiesced even if the command
* effects say only one namespace is affected.
*/
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
mutex_lock(&ctrl->scan_lock);
mutex_lock(&ctrl->subsys->lock);
nvme_mpath_start_freeze(ctrl->subsys);
nvme_mpath_wait_freeze(ctrl->subsys);
nvme_start_freeze(ctrl);
nvme_wait_freeze(ctrl);
}
return effects;
}
static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
{
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
if (_nvme_revalidate_disk(ns->disk))
nvme_set_queue_dying(ns);
else if (blk_queue_is_zoned(ns->disk->queue)) {
/*
* IO commands are required to fully revalidate a zoned
* device. Force the command effects to trigger rescan
* work so report zones can run in a context with
* unfrozen IO queues.
*/
*effects |= NVME_CMD_EFFECTS_NCC;
}
up_read(&ctrl->namespaces_rwsem);
}
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
{
/*
* Revalidate LBA changes prior to unfreezing. This is necessary to
* prevent memory corruption if a logical block size was changed by
* this command.
*/
if (effects & NVME_CMD_EFFECTS_LBCC)
nvme_update_formats(ctrl, &effects);
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
if (effects & NVME_CMD_EFFECTS_CCC)
nvme_init_identify(ctrl);
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
nvme_queue_scan(ctrl);
flush_work(&ctrl->scan_work);
}
}
void nvme_execute_passthru_rq(struct request *rq)
{
struct nvme_command *cmd = nvme_req(rq)->cmd;
struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns ? ns->disk : NULL;
u32 effects;
effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
blk_execute_rq(rq->q, disk, rq, 0);
nvme_passthru_end(ctrl, effects);
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
@ -922,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
}
}
blk_execute_rq(req->q, disk, req, 0);
nvme_execute_passthru_rq(req);
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
else
@ -1056,8 +1215,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
return error;
}
static bool nvme_multi_css(struct nvme_ctrl *ctrl)
{
return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
}
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
struct nvme_ns_id_desc *cur)
struct nvme_ns_id_desc *cur, bool *csi_seen)
{
const char *warn_str = "ctrl returned bogus length:";
void *data = cur;
@ -1087,6 +1251,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
}
uuid_copy(&ids->uuid, data + sizeof(*cur));
return NVME_NIDT_UUID_LEN;
case NVME_NIDT_CSI:
if (cur->nidl != NVME_NIDT_CSI_LEN) {
dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
warn_str, cur->nidl);
return -1;
}
memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
*csi_seen = true;
return NVME_NIDT_CSI_LEN;
default:
/* Skip unknown types */
return cur->nidl;
@ -1097,10 +1270,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids)
{
struct nvme_command c = { };
int status;
bool csi_seen = false;
int status, pos, len;
void *data;
int pos;
int len;
if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
return 0;
@ -1127,12 +1299,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (cur->nidl == 0)
break;
len = nvme_process_ns_desc(ctrl, ids, cur);
len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
if (len < 0)
goto free_data;
break;
len += sizeof(*cur);
}
if (nvme_multi_css(ctrl) && !csi_seen) {
dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
nsid);
status = -EINVAL;
}
free_data:
kfree(data);
return status;
@ -1321,96 +1500,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
static u32 nvme_known_admin_effects(u8 opcode)
{
switch (opcode) {
case nvme_admin_format_nvm:
return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
NVME_CMD_EFFECTS_CSE_MASK;
case nvme_admin_sanitize_nvm:
return NVME_CMD_EFFECTS_CSE_MASK;
default:
break;
}
return 0;
}
static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode)
{
u32 effects = 0;
if (ns) {
if (ctrl->effects)
effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
dev_warn(ctrl->device,
"IO command:%02x has unhandled effects:%08x\n",
opcode, effects);
return 0;
}
if (ctrl->effects)
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
effects |= nvme_known_admin_effects(opcode);
/*
* For simplicity, IO to all namespaces is quiesced even if the command
* effects say only one namespace is affected.
*/
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
mutex_lock(&ctrl->scan_lock);
mutex_lock(&ctrl->subsys->lock);
nvme_mpath_start_freeze(ctrl->subsys);
nvme_mpath_wait_freeze(ctrl->subsys);
nvme_start_freeze(ctrl);
nvme_wait_freeze(ctrl);
}
return effects;
}
static void nvme_update_formats(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
if (ns->disk && nvme_revalidate_disk(ns->disk))
nvme_set_queue_dying(ns);
up_read(&ctrl->namespaces_rwsem);
}
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
{
/*
* Revalidate LBA changes prior to unfreezing. This is necessary to
* prevent memory corruption if a logical block size was changed by
* this command.
*/
if (effects & NVME_CMD_EFFECTS_LBCC)
nvme_update_formats(ctrl);
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
if (effects & NVME_CMD_EFFECTS_CCC)
nvme_init_identify(ctrl);
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
nvme_queue_scan(ctrl);
flush_work(&ctrl->scan_work);
}
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
unsigned timeout = 0;
u32 effects;
u64 result;
int status;
@ -1437,12 +1532,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &result, timeout);
nvme_passthru_end(ctrl, effects);
if (status >= 0) {
if (put_user(result, &ucmd->result))
@ -1458,7 +1551,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@ -1484,12 +1576,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &cmd.result, timeout);
nvme_passthru_end(ctrl, effects);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
@ -1503,7 +1593,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
* Issue ioctl requests on the first available path. Note that unlike normal
* block layer requests we will not retry failed request on another controller.
*/
static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
struct nvme_ns_head **head, int *srcu_idx)
{
#ifdef CONFIG_NVME_MULTIPATH
@ -1523,7 +1613,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
return disk->private_data;
}
static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
{
if (head)
srcu_read_unlock(&head->srcu, idx);
@ -1789,7 +1879,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
if (ctrl->vs >= NVME_VS(1, 3, 0))
if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
return nvme_identify_ns_descs(ctrl, nsid, ids);
return 0;
}
@ -1805,7 +1895,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
return uuid_equal(&a->uuid, &b->uuid) &&
memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
a->csi == b->csi;
}
static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@ -1915,18 +2006,38 @@ static void nvme_update_disk_info(struct gendisk *disk,
static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
u32 iob;
/*
* If identify namespace failed, use default 512 byte block size so
* block layer can use before failing read/write for 0 capacity.
*/
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
ns->lba_shift = id->lbaf[lbaf].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
switch (ns->head->ids.csi) {
case NVME_CSI_NVM:
break;
case NVME_CSI_ZNS:
ret = nvme_update_zone_info(disk, ns, lbaf);
if (ret) {
dev_warn(ctrl->device,
"failed to add zoned namespace:%u ret:%d\n",
ns->head->ns_id, ret);
return ret;
}
break;
default:
dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
ns->head->ids.csi, ns->head->ns_id);
return -ENODEV;
}
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
@ -1934,7 +2045,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
ns->features = 0;
ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
/* the PI implementation requires metadata equal t10 pi tuple size */
if (ns->ms == sizeof(struct t10_pi_tuple))
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
@ -1977,7 +2088,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
return 0;
}
static int nvme_revalidate_disk(struct gendisk *disk)
static int _nvme_revalidate_disk(struct gendisk *disk)
{
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
@ -2025,6 +2136,28 @@ out:
return ret;
}
static int nvme_revalidate_disk(struct gendisk *disk)
{
int ret;
ret = _nvme_revalidate_disk(disk);
if (ret)
return ret;
#ifdef CONFIG_BLK_DEV_ZONED
if (blk_queue_is_zoned(disk->queue)) {
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
ret = blk_revalidate_disk_zones(disk, NULL);
if (!ret)
blk_queue_max_zone_append_sectors(disk->queue,
ctrl->max_zone_append);
}
#endif
return ret;
}
static char nvme_pr_type(enum pr_type type)
{
switch (type) {
@ -2155,6 +2288,7 @@ static const struct block_device_operations nvme_fops = {
.release = nvme_release,
.getgeo = nvme_getgeo,
.revalidate_disk= nvme_revalidate_disk,
.report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@ -2181,6 +2315,7 @@ const struct block_device_operations nvme_ns_head_ops = {
.ioctl = nvme_ioctl,
.compat_ioctl = nvme_compat_ioctl,
.getgeo = nvme_getgeo,
.report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
#endif /* CONFIG_NVME_MULTIPATH */
@ -2238,12 +2373,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accomodate architectures with differing
* kernel and IO page sizes.
*/
unsigned dev_page_min, page_shift = 12;
unsigned dev_page_min;
int ret;
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
@ -2253,17 +2383,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
}
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
if (page_shift < dev_page_min) {
if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
1 << dev_page_min, 1 << page_shift);
1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
return -ENODEV;
}
ctrl->page_size = 1 << page_shift;
ctrl->ctrl_config = NVME_CC_CSS_NVM;
ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
ctrl->ctrl_config = NVME_CC_CSS_CSI;
else
ctrl->ctrl_config = NVME_CC_CSS_NVM;
ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
ctrl->ctrl_config |= NVME_CC_ENABLE;
@ -2313,13 +2444,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
if (ctrl->max_hw_sectors) {
u32 max_segments =
(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
max_segments = min_not_zero(max_segments, ctrl->max_segments);
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
blk_queue_virt_boundary(q, ctrl->page_size - 1);
blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
blk_queue_dma_alignment(q, 7);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
@ -2810,7 +2941,7 @@ out_unlock:
return ret;
}
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
@ -2824,27 +2955,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
c.get_log_page.csi = csi;
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
{
struct nvme_cel *cel, *ret = NULL;
spin_lock(&ctrl->lock);
list_for_each_entry(cel, &ctrl->cels, entry) {
if (cel->csi == csi) {
ret = cel;
break;
}
}
spin_unlock(&ctrl->lock);
return ret;
}
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
int ret;
if (!ctrl->effects)
ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
if (cel)
goto out;
if (!ctrl->effects)
return 0;
cel = kzalloc(sizeof(*cel), GFP_KERNEL);
if (!cel)
return -ENOMEM;
ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
ctrl->effects, sizeof(*ctrl->effects), 0);
ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
&cel->log, sizeof(cel->log), 0);
if (ret) {
kfree(ctrl->effects);
ctrl->effects = NULL;
kfree(cel);
return ret;
}
return ret;
cel->csi = csi;
spin_lock(&ctrl->lock);
list_add_tail(&cel->entry, &ctrl->cels);
spin_unlock(&ctrl->lock);
out:
*log = &cel->log;
return 0;
}
/*
@ -2865,7 +3024,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
return ret;
}
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
@ -2877,7 +3036,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
ret = nvme_get_effects_log(ctrl);
ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
if (ret < 0)
goto out_free;
}
@ -2939,7 +3098,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (id->rtd3e) {
/* us -> s */
u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
shutdown_timeout, 60);
@ -3345,6 +3504,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_CONNECTING] = "connecting",
[NVME_CTRL_DELETING] = "deleting",
[NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
[NVME_CTRL_DEAD] = "dead",
};
@ -3397,6 +3557,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
struct nvmf_ctrl_options *opts = ctrl->opts;
if (ctrl->opts->max_reconnects == -1)
return sprintf(buf, "off\n");
return sprintf(buf, "%d\n",
opts->max_reconnects * opts->reconnect_delay);
}
static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
struct nvmf_ctrl_options *opts = ctrl->opts;
int ctrl_loss_tmo, err;
err = kstrtoint(buf, 10, &ctrl_loss_tmo);
if (err)
return -EINVAL;
else if (ctrl_loss_tmo < 0)
opts->max_reconnects = -1;
else
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
opts->reconnect_delay);
return count;
}
static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
if (ctrl->opts->reconnect_delay == -1)
return sprintf(buf, "off\n");
return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
}
static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
unsigned int v;
int err;
err = kstrtou32(buf, 10, &v);
if (err)
return err;
ctrl->opts->reconnect_delay = v;
return count;
}
static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@ -3414,6 +3634,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_sqsize.attr,
&dev_attr_hostnqn.attr,
&dev_attr_hostid.attr,
&dev_attr_ctrl_loss_tmo.attr,
&dev_attr_reconnect_delay.attr,
NULL
};
@ -3510,6 +3732,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
goto out_cleanup_srcu;
}
if (head->ids.csi) {
ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
if (ret)
goto out_cleanup_srcu;
} else
head->effects = ctrl->effects;
ret = nvme_mpath_alloc_disk(ctrl, head);
if (ret)
goto out_cleanup_srcu;
@ -3591,7 +3820,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
return nsa->head->ns_id - nsb->head->ns_id;
}
static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns, *ret = NULL;
@ -3609,6 +3838,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
up_read(&ctrl->namespaces_rwsem);
return ret;
}
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
@ -3726,7 +3956,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_mpath_clear_current_path(ns);
synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
if (ns->disk->flags & GENHD_FL_UP) {
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
if (blk_get_integrity(ns->disk))
@ -3757,7 +3987,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
if (ns->disk && revalidate_disk(ns->disk))
if (revalidate_disk(ns->disk))
nvme_ns_remove(ns);
nvme_put_ns(ns);
} else
@ -3850,8 +4080,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
* raced with us in reading the log page, which could cause us to miss
* updates.
*/
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
log_size, 0);
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
NVME_CSI_NVM, log, log_size, 0);
if (error)
dev_warn(ctrl->device,
"reading changed ns log failed: %d\n", error);
@ -3912,6 +4142,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
if (ctrl->state == NVME_CTRL_DEAD)
nvme_kill_queues(ctrl);
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
down_write(&ctrl->namespaces_rwsem);
list_splice_init(&ctrl->namespaces, &ns_list);
up_write(&ctrl->namespaces_rwsem);
@ -3995,8 +4228,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
sizeof(*log), 0))
if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
log, sizeof(*log), 0))
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
}
@ -4106,8 +4339,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
if (ctrl->kato)
nvme_start_keep_alive(ctrl);
nvme_start_keep_alive(ctrl);
nvme_enable_aen(ctrl);
@ -4133,11 +4365,16 @@ static void nvme_free_ctrl(struct device *dev)
struct nvme_ctrl *ctrl =
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
struct nvme_cel *cel, *next;
if (subsys && ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
list_del(&cel->entry);
kfree(cel);
}
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
@ -4168,6 +4405,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
INIT_LIST_HEAD(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
ctrl->ops = ops;
@ -4346,6 +4584,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);
struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
{
struct nvme_ctrl *ctrl;
struct file *f;
f = filp_open(path, O_RDWR, 0);
if (IS_ERR(f))
return ERR_CAST(f);
if (f->f_op != &nvme_dev_fops) {
ctrl = ERR_PTR(-EINVAL);
goto out_close;
}
ctrl = f->private_data;
nvme_get_ctrl(ctrl);
out_close:
filp_close(f, NULL);
return ctrl;
}
EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);
/*
* Check we didn't inadvertently grow the command structure sizes:
*/
@ -4364,6 +4625,8 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);

View File

@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
struct request *rq)
{
if (ctrl->state != NVME_CTRL_DELETING &&
if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
ctrl->state != NVME_CTRL_DEAD &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;

View File

@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
if (likely(ctrl->state == NVME_CTRL_LIVE))
if (likely(ctrl->state == NVME_CTRL_LIVE ||
ctrl->state == NVME_CTRL_DELETING))
return true;
return __nvmf_check_ready(ctrl, rq, queue_live);
}

View File

@ -826,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
break;
case NVME_CTRL_DELETING:
case NVME_CTRL_DELETING_NOIO:
default:
/* no action to take - let it delete */
break;
@ -3001,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_disconnect_admin_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments;
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
(ilog2(SZ_4K) - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);

View File

@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
int ret;
ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
&data->log, sizeof(data->log), 0);
NVME_CSI_NVM, &data->log, sizeof(data->log), 0);
return ret <= 0 ? ret : -EIO;
}
@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl)
err = nvme_hwmon_get_smart_log(data);
if (err) {
dev_warn(dev, "Failed to read smart log (error %d)\n", err);
dev_warn(ctrl->device,
"Failed to read smart log (error %d)\n", err);
devm_kfree(dev, data);
return;
}

View File

@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
dev_meta_off = dev_meta;
ret = nvme_get_log(ctrl, ns->head->ns_id,
NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
offset);
NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM,
dev_meta, len, offset);
if (ret) {
dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
break;

View File

@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
return ns->ctrl->state != NVME_CTRL_LIVE ||
test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
test_bit(NVME_NS_REMOVING, &ns->flags);
/*
* We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
* still be able to complete assuming that the controller is connected.
* Otherwise it will fail immediately and return to the requeue list.
*/
if (ns->ctrl->state != NVME_CTRL_LIVE &&
ns->ctrl->state != NVME_CTRL_DELETING)
return true;
if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
test_bit(NVME_NS_REMOVING, &ns->flags))
return true;
return false;
}
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
@ -246,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
fallback = ns;
}
/* No optimized path found, re-check the current path */
if (!nvme_path_is_disabled(old) &&
old->ana_state == NVME_ANA_OPTIMIZED) {
found = old;
goto out;
}
if (!fallback)
return NULL;
found = fallback;
@ -266,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
ns = nvme_round_robin_path(head, node, ns);
if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head, node);
if (unlikely(!ns))
return __nvme_find_path(head, node);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
}
@ -527,7 +545,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
int error;
mutex_lock(&ctrl->ana_lock);
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
ctrl->ana_log_buf, ctrl->ana_log_size, 0);
if (error) {
dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
@ -563,6 +581,9 @@ static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
if (ctrl->state != NVME_CTRL_LIVE)
return;
nvme_read_ana_log(ctrl);
}

View File

@ -37,6 +37,14 @@ extern unsigned int admin_timeout;
#define NVME_INLINE_METADATA_SG_CNT 1
#endif
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accommodate architectures with differing
* kernel and IO page sizes.
*/
#define NVME_CTRL_PAGE_SHIFT 12
#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT)
extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
@ -180,12 +188,32 @@ static inline u16 nvme_req_qid(struct request *req)
*/
#define NVME_QUIRK_DELAY_AMOUNT 2300
/*
* enum nvme_ctrl_state: Controller state
*
* @NVME_CTRL_NEW: New controller just allocated, initial state
* @NVME_CTRL_LIVE: Controller is connected and I/O capable
* @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset)
* @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the
* transport
* @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion)
* @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not
* disabled/failed immediately. This state comes
* after all async event processing took place and
* before ns removal and the controller deletion
* progress
* @NVME_CTRL_DEAD: Controller is non-present/unresponsive during
* shutdown or removal. In this case we forcibly
* kill all inflight I/O as they have no chance to
* complete
*/
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
NVME_CTRL_RESETTING,
NVME_CTRL_CONNECTING,
NVME_CTRL_DELETING,
NVME_CTRL_DELETING_NOIO,
NVME_CTRL_DEAD,
};
@ -198,6 +226,12 @@ struct nvme_fault_inject {
#endif
};
struct nvme_cel {
struct list_head entry;
struct nvme_effects_log log;
u8 csi;
};
struct nvme_ctrl {
bool comp_seen;
enum nvme_ctrl_state state;
@ -235,10 +269,12 @@ struct nvme_ctrl {
u32 queue_count;
u64 cap;
u32 page_size;
u32 max_hw_sectors;
u32 max_segments;
u32 max_integrity_segments;
#ifdef CONFIG_BLK_DEV_ZONED
u32 max_zone_append;
#endif
u16 crdt[3];
u16 oncs;
u16 oacs;
@ -264,6 +300,7 @@ struct nvme_ctrl {
unsigned long quirks;
struct nvme_id_power_state psd[32];
struct nvme_effects_log *effects;
struct list_head cels;
struct work_struct scan_work;
struct work_struct async_event_work;
struct delayed_work ka_work;
@ -346,6 +383,7 @@ struct nvme_ns_ids {
u8 eui64[8];
u8 nguid[16];
uuid_t uuid;
u8 csi;
};
/*
@ -365,6 +403,7 @@ struct nvme_ns_head {
struct kref ref;
bool shared;
int instance;
struct nvme_effects_log *effects;
#ifdef CONFIG_NVME_MULTIPATH
struct gendisk *disk;
struct bio_list requeue_list;
@ -402,6 +441,9 @@ struct nvme_ns {
u16 sgs;
u32 sws;
u8 pi_type;
#ifdef CONFIG_BLK_DEV_ZONED
u64 zsze;
#endif
unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
@ -567,8 +609,11 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset);
struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
struct nvme_ns_head **head, int *srcu_idx);
void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct block_device_operations nvme_ns_head_ops;
@ -704,6 +749,36 @@ static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_BLK_DEV_ZONED
int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
unsigned lbaf);
int nvme_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action);
#else
#define nvme_report_zones NULL
static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action)
{
return BLK_STS_NOTSUPP;
}
static inline int nvme_update_zone_info(struct gendisk *disk,
struct nvme_ns *ns,
unsigned lbaf)
{
dev_warn(ns->ctrl->device,
"Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
return -EPROTONOSUPPORT;
}
#endif
#ifdef CONFIG_NVM
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
void nvme_nvm_unregister(struct nvme_ns *ns);
@ -735,4 +810,11 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl);
static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
#endif
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode);
void nvme_execute_passthru_rq(struct request *rq);
struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path);
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
void nvme_put_ns(struct nvme_ns *ns);
#endif /* _NVME_H */

View File

@ -4,6 +4,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
*/
#include <linux/acpi.h>
#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
@ -61,10 +62,10 @@ MODULE_PARM_DESC(sgl_threshold,
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
.set = io_queue_depth_set,
.get = param_get_int,
.get = param_get_uint,
};
static int io_queue_depth = 1024;
static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
@ -94,6 +95,10 @@ static unsigned int poll_queues;
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
struct nvme_dev;
struct nvme_queue;
@ -115,7 +120,7 @@ struct nvme_dev {
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
u16 q_depth;
int io_sqes;
u32 db_stride;
void __iomem *bar;
@ -151,13 +156,14 @@ struct nvme_dev {
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
int n = 0, ret;
int ret;
u16 n;
ret = kstrtoint(val, 10, &n);
ret = kstrtou16(val, 10, &n);
if (ret != 0 || n < 2)
return -EINVAL;
return param_set_int(val, kp);
return param_set_ushort(val, kp);
}
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@ -345,10 +351,10 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
static int nvme_npages(unsigned size, struct nvme_dev *dev)
static int nvme_pci_npages_prp(void)
{
unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
dev->ctrl.page_size);
unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
NVME_CTRL_PAGE_SIZE);
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
@ -356,22 +362,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev)
* Calculates the number of pages needed for the SGL segments. For example a 4k
* page can accommodate 256 SGL descriptors.
*/
static int nvme_pci_npages_sgl(unsigned int num_seg)
static int nvme_pci_npages_sgl(void)
{
return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
PAGE_SIZE);
}
static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
unsigned int size, unsigned int nseg, bool use_sgl)
static size_t nvme_pci_iod_alloc_size(void)
{
size_t alloc_size;
size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
if (use_sgl)
alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
else
alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
return alloc_size + sizeof(struct scatterlist) * nseg;
return sizeof(__le64 *) * npages +
sizeof(struct scatterlist) * NVME_MAX_SEGS;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@ -500,9 +502,6 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
int nseg = blk_rq_nr_phys_segments(req);
unsigned int avg_seg_size;
if (nseg == 0)
return false;
avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
@ -517,7 +516,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;
@ -584,34 +583,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
struct scatterlist *sg = iod->sg;
int dma_len = sg_dma_len(sg);
u64 dma_addr = sg_dma_address(sg);
u32 page_size = dev->ctrl.page_size;
int offset = dma_addr & (page_size - 1);
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
void **list = nvme_pci_iod_list(req);
dma_addr_t prp_dma;
int nprps, i;
length -= (page_size - offset);
length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
iod->first_dma = 0;
goto done;
}
dma_len -= (page_size - offset);
dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
if (dma_len) {
dma_addr += (page_size - offset);
dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
} else {
sg = sg_next(sg);
dma_addr = sg_dma_address(sg);
dma_len = sg_dma_len(sg);
}
if (length <= page_size) {
if (length <= NVME_CTRL_PAGE_SIZE) {
iod->first_dma = dma_addr;
goto done;
}
nprps = DIV_ROUND_UP(length, page_size);
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
if (nprps <= (256 / 8)) {
pool = dev->prp_small_pool;
iod->npages = 0;
@ -630,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
iod->first_dma = prp_dma;
i = 0;
for (;;) {
if (i == page_size >> 3) {
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list)
@ -641,9 +639,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
i = 1;
}
prp_list[i++] = cpu_to_le64(dma_addr);
dma_len -= page_size;
dma_addr += page_size;
length -= page_size;
dma_len -= NVME_CTRL_PAGE_SIZE;
dma_addr += NVME_CTRL_PAGE_SIZE;
length -= NVME_CTRL_PAGE_SIZE;
if (length <= 0)
break;
if (dma_len > 0)
@ -753,8 +751,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
struct bio_vec *bv)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
unsigned int first_prp_len = dev->ctrl.page_size - offset;
unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->first_dma))
@ -764,7 +762,7 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
if (bv->bv_len > first_prp_len)
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
return 0;
return BLK_STS_OK;
}
static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
@ -782,7 +780,7 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
return 0;
return BLK_STS_OK;
}
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
@ -796,7 +794,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
@ -846,7 +844,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
if (dma_mapping_error(dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
return 0;
return BLK_STS_OK;
}
/*
@ -1019,6 +1017,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
static irqreturn_t nvme_irq_check(int irq, void *data)
{
struct nvme_queue *nvmeq = data;
if (nvme_cqe_pending(nvmeq))
return IRQ_WAKE_THREAD;
return IRQ_NONE;
@ -1154,7 +1153,6 @@ static void abort_endio(struct request *req, blk_status_t error)
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
/* If true, indicates loss of adapter communication, possibly by a
* NVMe Subsystem reset.
*/
@ -1261,9 +1259,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
}
/*
* Shutdown the controller immediately and schedule a reset if the
* command was already aborted once before and still hasn't been
* returned to the driver, or if this is the admin queue.
* Shutdown the controller immediately and schedule a reset if the
* command was already aborted once before and still hasn't been
* returned to the driver, or if this is the admin queue.
*/
if (!nvmeq->qid || iod->aborted) {
dev_warn(dev->ctrl.device,
@ -1398,11 +1396,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
{
int q_depth = dev->q_depth;
unsigned q_size_aligned = roundup(q_depth * entry_size,
dev->ctrl.page_size);
NVME_CTRL_PAGE_SIZE);
if (q_size_aligned * nr_io_queues > dev->cmb_size) {
u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
q_depth = div_u64(mem_per_q, entry_size);
/*
@ -1817,6 +1816,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
u64 dma_addr = dev->host_mem_descs_dma;
struct nvme_command c;
int ret;
@ -1825,8 +1825,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
c.features.opcode = nvme_admin_set_features;
c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
c.features.dword11 = cpu_to_le32(bits);
c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
ilog2(dev->ctrl.page_size));
c.features.dword12 = cpu_to_le32(host_mem_size);
c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
@ -1846,7 +1845,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
for (i = 0; i < dev->nr_host_mem_descs; i++) {
struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
le64_to_cpu(desc->addr),
@ -1898,7 +1897,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
break;
descs[i].addr = cpu_to_le64(dma_addr);
descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
i++;
}
@ -1914,7 +1913,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
out_free_bufs:
while (--i >= 0) {
size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
dma_free_attrs(dev->dev, size, bufs[i],
le64_to_cpu(descs[i].addr),
@ -1932,12 +1931,12 @@ out:
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
u32 chunk_size;
u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
u64 chunk_size;
/* start big and work our way down */
for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
chunk_size /= 2) {
for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
if (!min || dev->host_mem_size >= min)
return 0;
@ -2003,7 +2002,7 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
/*
* If there is no interupt available for queues, ensure that
* If there is no interrupt available for queues, ensure that
* the default queue is set to 1. The affinity set size is
* also set to one, but the irq core ignores it for this case.
*
@ -2261,8 +2260,8 @@ static void nvme_dev_add(struct nvme_dev *dev)
dev->tagset.nr_maps++;
dev->tagset.timeout = NVME_IO_TIMEOUT;
dev->tagset.numa_node = dev->ctrl.numa_node;
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
BLK_MQ_MAX_DEPTH) - 1;
dev->tagset.cmd_size = sizeof(struct nvme_iod);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
@ -2321,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
@ -2760,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
#ifdef CONFIG_ACPI
static bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
struct acpi_device *adev;
struct pci_dev *root;
acpi_handle handle;
acpi_status status;
u8 val;
/*
* Look for _DSD property specifying that the storage device on the port
* must use D3 to support deep platform power savings during
* suspend-to-idle.
*/
root = pcie_find_root_port(dev);
if (!root)
return false;
adev = ACPI_COMPANION(&root->dev);
if (!adev)
return false;
/*
* The property is defined in the PXSX device for South complex ports
* and in the PEGP device for North complex ports.
*/
status = acpi_get_handle(adev->handle, "PXSX", &handle);
if (ACPI_FAILURE(status)) {
status = acpi_get_handle(adev->handle, "PEGP", &handle);
if (ACPI_FAILURE(status))
return false;
}
if (acpi_bus_get_device(handle, &adev))
return false;
if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
&val))
return false;
return val == 1;
}
#else
static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
return false;
}
#endif /* CONFIG_ACPI */
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
struct nvme_dev *dev = data;
@ -2809,12 +2856,21 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
quirks |= check_vendor_combination_bug(pdev);
if (!noacpi && nvme_acpi_storage_d3(pdev)) {
/*
* Some systems use a bios work around to ask for D3 on
* platforms that support kernel managed suspend.
*/
dev_info(&pdev->dev,
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
/*
* Double check that our mempool alloc size will cover the biggest
* command we support.
*/
alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
NVME_MAX_SEGS, true);
alloc_size = nvme_pci_iod_alloc_size();
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
@ -2876,6 +2932,7 @@ static void nvme_reset_done(struct pci_dev *pdev)
static void nvme_shutdown(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
nvme_disable_prepare_reset(dev, true);
}
@ -3006,6 +3063,7 @@ unfreeze:
static int nvme_simple_suspend(struct device *dev)
{
struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
return nvme_disable_prepare_reset(ndev, true);
}
@ -3079,16 +3137,16 @@ static const struct pci_error_handlers nvme_err_handler = {
};
static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0x0953),
{ PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a53),
{ PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a54),
{ PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a55),
{ PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */
.driver_data = NVME_QUIRK_STRIPE_SIZE |
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */

View File

@ -96,6 +96,7 @@ struct nvme_rdma_queue {
int cm_error;
struct completion cm_done;
bool pi_support;
int cq_size;
};
struct nvme_rdma_ctrl {
@ -275,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
init_attr.recv_cq = queue->ib_cq;
if (queue->pi_support)
init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
init_attr.qp_context = queue;
ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
@ -409,6 +411,14 @@ out_err:
return NULL;
}
static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
{
if (nvme_rdma_poll_queue(queue))
ib_free_cq(queue->ib_cq);
else
ib_cq_pool_put(queue->ib_cq, queue->cq_size);
}
static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
{
struct nvme_rdma_device *dev;
@ -430,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
* the destruction of the QP shouldn't use rdma_cm API.
*/
ib_destroy_qp(queue->qp);
ib_free_cq(queue->ib_cq);
nvme_rdma_free_cq(queue);
nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
sizeof(struct nvme_completion), DMA_FROM_DEVICE);
@ -450,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
}
static int nvme_rdma_create_cq(struct ib_device *ibdev,
struct nvme_rdma_queue *queue)
{
int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
enum ib_poll_context poll_ctx;
/*
* Spread I/O queues completion vectors according their queue index.
* Admin queues can always go on completion vector 0.
*/
comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
/* Polling queues need direct cq polling context */
if (nvme_rdma_poll_queue(queue)) {
poll_ctx = IB_POLL_DIRECT;
queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
comp_vector, poll_ctx);
} else {
poll_ctx = IB_POLL_SOFTIRQ;
queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
comp_vector, poll_ctx);
}
if (IS_ERR(queue->ib_cq)) {
ret = PTR_ERR(queue->ib_cq);
return ret;
}
return 0;
}
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
{
struct ib_device *ibdev;
const int send_wr_factor = 3; /* MR, SEND, INV */
const int cq_factor = send_wr_factor + 1; /* + RECV */
int comp_vector, idx = nvme_rdma_queue_idx(queue);
enum ib_poll_context poll_ctx;
int ret, pages_per_mr;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
@ -467,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
}
ibdev = queue->device->dev;
/*
* Spread I/O queues completion vectors according their queue index.
* Admin queues can always go on completion vector 0.
*/
comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
/* Polling queues need direct cq polling context */
if (nvme_rdma_poll_queue(queue))
poll_ctx = IB_POLL_DIRECT;
else
poll_ctx = IB_POLL_SOFTIRQ;
/* +1 for ib_stop_cq */
queue->ib_cq = ib_alloc_cq(ibdev, queue,
cq_factor * queue->queue_size + 1,
comp_vector, poll_ctx);
if (IS_ERR(queue->ib_cq)) {
ret = PTR_ERR(queue->ib_cq);
queue->cq_size = cq_factor * queue->queue_size + 1;
ret = nvme_rdma_create_cq(ibdev, queue);
if (ret)
goto out_put_dev;
}
ret = nvme_rdma_create_qp(queue, send_wr_factor);
if (ret)
@ -512,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize MR pool sized %d for QID %d\n",
queue->queue_size, idx);
queue->queue_size, nvme_rdma_queue_idx(queue));
goto out_destroy_ring;
}
@ -523,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize PI MR pool sized %d for QID %d\n",
queue->queue_size, idx);
queue->queue_size, nvme_rdma_queue_idx(queue));
goto out_destroy_mr_pool;
}
}
@ -540,7 +565,7 @@ out_destroy_ring:
out_destroy_qp:
rdma_destroy_qp(queue->cm_id);
out_destroy_ib_cq:
ib_free_cq(queue->ib_cq);
nvme_rdma_free_cq(queue);
out_put_dev:
nvme_rdma_dev_put(queue->device);
return ret;
@ -942,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->ctrl.connect_q);
goto out_free_tag_set;
}
} else {
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
ctrl->ctrl.queue_count - 1);
}
ret = nvme_rdma_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
if (!new) {
nvme_start_queues(&ctrl->ctrl);
nvme_wait_freeze(&ctrl->ctrl);
blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
ctrl->ctrl.queue_count - 1);
nvme_unfreeze(&ctrl->ctrl);
}
return 0;
out_cleanup_connect_q:
@ -983,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_start_freeze(&ctrl->ctrl);
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
if (ctrl->ctrl.tagset) {
@ -1077,11 +1108,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
if (!changed) {
/*
* state change failure is ok if we're in DELETING state,
* state change failure is ok if we started ctrl delete,
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
@ -1134,8 +1166,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
return;
}
@ -1163,7 +1196,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req)
static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
const char *op)
{
struct nvme_rdma_queue *queue = cq->cq_context;
struct nvme_rdma_queue *queue = wc->qp->qp_context;
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
@ -1706,7 +1739,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvme_rdma_qe *qe =
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
struct nvme_rdma_queue *queue = cq->cq_context;
struct nvme_rdma_queue *queue = wc->qp->qp_context;
struct ib_device *ibdev = queue->device->dev;
struct nvme_completion *cqe = qe->data;
const size_t len = sizeof(struct nvme_completion);

View File

@ -46,6 +46,7 @@ struct nvme_tcp_request {
u32 pdu_sent;
u16 ttag;
struct list_head entry;
struct llist_node lentry;
__le32 ddgst;
struct bio *curr_bio;
@ -75,9 +76,10 @@ struct nvme_tcp_queue {
struct work_struct io_work;
int io_cpu;
spinlock_t lock;
struct mutex send_mutex;
struct llist_head req_list;
struct list_head send_list;
bool more_requests;
/* recv state */
void *pdu;
@ -261,15 +263,13 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
bool sync)
bool sync, bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
spin_lock(&queue->lock);
empty = list_empty(&queue->send_list) && !queue->request;
list_add_tail(&req->entry, &queue->send_list);
spin_unlock(&queue->lock);
empty = llist_add(&req->lentry, &queue->req_list) &&
list_empty(&queue->send_list) && !queue->request;
/*
* if we're the first on the send_list and we can try to send
@ -278,25 +278,42 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
*/
if (queue->io_cpu == smp_processor_id() &&
sync && empty && mutex_trylock(&queue->send_mutex)) {
queue->more_requests = !last;
nvme_tcp_try_send(queue);
queue->more_requests = false;
mutex_unlock(&queue->send_mutex);
} else {
} else if (last) {
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
}
static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
struct llist_node *node;
for (node = llist_del_all(&queue->req_list); node; node = node->next) {
req = llist_entry(node, struct nvme_tcp_request, lentry);
list_add(&req->entry, &queue->send_list);
}
}
static inline struct nvme_tcp_request *
nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_request *req;
spin_lock(&queue->lock);
req = list_first_entry_or_null(&queue->send_list,
struct nvme_tcp_request, entry);
if (req)
list_del(&req->entry);
spin_unlock(&queue->lock);
if (!req) {
nvme_tcp_process_req_list(queue);
req = list_first_entry_or_null(&queue->send_list,
struct nvme_tcp_request, entry);
if (unlikely(!req))
return NULL;
}
list_del(&req->entry);
return req;
}
@ -596,7 +613,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
nvme_tcp_queue_request(req, false);
nvme_tcp_queue_request(req, false, true);
return 0;
}
@ -863,6 +880,12 @@ done:
read_unlock(&sk->sk_callback_lock);
}
static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
{
return !list_empty(&queue->send_list) ||
!llist_empty(&queue->req_list) || queue->more_requests;
}
static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
{
queue->request = NULL;
@ -884,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
bool last = nvme_tcp_pdu_last_send(req, len);
int ret, flags = MSG_DONTWAIT;
if (last && !queue->data_digest)
if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
flags |= MSG_EOR;
else
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
@ -931,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
int flags = MSG_DONTWAIT;
int ret;
if (inline_data)
if (inline_data || nvme_tcp_queue_more(queue))
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
@ -996,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
int ret;
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
struct kvec iov = {
.iov_base = &req->ddgst + req->offset,
.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
};
if (nvme_tcp_queue_more(queue))
msg.msg_flags |= MSG_MORE;
else
msg.msg_flags |= MSG_EOR;
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
if (unlikely(ret <= 0))
return ret;
@ -1344,8 +1372,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
int ret, rcv_pdu_size;
queue->ctrl = ctrl;
init_llist_head(&queue->req_list);
INIT_LIST_HEAD(&queue->send_list);
spin_lock_init(&queue->lock);
mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
queue->queue_size = queue_size;
@ -1746,15 +1774,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
} else {
blk_mq_update_nr_hw_queues(ctrl->tagset,
ctrl->queue_count - 1);
}
ret = nvme_tcp_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
if (!new) {
nvme_start_queues(ctrl);
nvme_wait_freeze(ctrl);
blk_mq_update_nr_hw_queues(ctrl->tagset,
ctrl->queue_count - 1);
nvme_unfreeze(ctrl);
}
return 0;
out_cleanup_connect_q:
@ -1859,6 +1892,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
{
if (ctrl->queue_count <= 1)
return;
nvme_start_freeze(ctrl);
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
if (ctrl->tagset) {
@ -1925,11 +1959,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
/*
* state change failure is ok if we're in DELETING state,
* state change failure is ok if we started ctrl delete,
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
@ -1985,8 +2020,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
blk_mq_unquiesce_queue(ctrl->admin_q);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
@ -2021,8 +2057,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
nvme_tcp_teardown_ctrl(ctrl, false);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
@ -2109,7 +2146,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
nvme_tcp_queue_request(&ctrl->async_req, true);
nvme_tcp_queue_request(&ctrl->async_req, true, true);
}
static enum blk_eh_timer_return
@ -2221,6 +2258,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
return 0;
}
static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
if (!llist_empty(&queue->req_list))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@ -2240,7 +2285,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(rq);
nvme_tcp_queue_request(req, true);
nvme_tcp_queue_request(req, true, bd->last);
return BLK_STS_OK;
}
@ -2308,6 +2353,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.commit_rqs = nvme_tcp_commit_rqs,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,
.exit_request = nvme_tcp_exit_request,

256
drivers/nvme/host/zns.c Normal file
View File

@ -0,0 +1,256 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2020 Western Digital Corporation or its affiliates.
*/
#include <linux/blkdev.h>
#include <linux/vmalloc.h>
#include "nvme.h"
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
{
struct nvme_command c = { };
struct nvme_id_ctrl_zns *id;
int status;
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_CS_CTRL;
c.identify.csi = NVME_CSI_ZNS;
status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
if (status) {
kfree(id);
return status;
}
if (id->zasl)
ctrl->max_zone_append = 1 << (id->zasl + 3);
else
ctrl->max_zone_append = ctrl->max_hw_sectors;
kfree(id);
return 0;
}
int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
unsigned lbaf)
{
struct nvme_effects_log *log = ns->head->effects;
struct request_queue *q = disk->queue;
struct nvme_command c = { };
struct nvme_id_ns_zns *id;
int status;
/* Driver requires zone append support */
if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) &
NVME_CMD_EFFECTS_CSUPP)) {
dev_warn(ns->ctrl->device,
"append not supported for zoned namespace:%d\n",
ns->head->ns_id);
return -EINVAL;
}
/* Lazily query controller append limit for the first zoned namespace */
if (!ns->ctrl->max_zone_append) {
status = nvme_set_max_append(ns->ctrl);
if (status)
return status;
}
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(ns->head->ns_id);
c.identify.cns = NVME_ID_CNS_CS_NS;
c.identify.csi = NVME_CSI_ZNS;
status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
if (status)
goto free_data;
/*
* We currently do not handle devices requiring any of the zoned
* operation characteristics.
*/
if (id->zoc) {
dev_warn(ns->ctrl->device,
"zone operations:%x not supported for namespace:%u\n",
le16_to_cpu(id->zoc), ns->head->ns_id);
status = -EINVAL;
goto free_data;
}
ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
if (!is_power_of_2(ns->zsze)) {
dev_warn(ns->ctrl->device,
"invalid zone size:%llu for namespace:%u\n",
ns->zsze, ns->head->ns_id);
status = -EINVAL;
goto free_data;
}
q->limits.zoned = BLK_ZONED_HM;
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
free_data:
kfree(id);
return status;
}
static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
unsigned int nr_zones, size_t *buflen)
{
struct request_queue *q = ns->disk->queue;
size_t bufsize;
void *buf;
const size_t min_bufsize = sizeof(struct nvme_zone_report) +
sizeof(struct nvme_zone_descriptor);
nr_zones = min_t(unsigned int, nr_zones,
get_capacity(ns->disk) >> ilog2(ns->zsze));
bufsize = sizeof(struct nvme_zone_report) +
nr_zones * sizeof(struct nvme_zone_descriptor);
bufsize = min_t(size_t, bufsize,
queue_max_hw_sectors(q) << SECTOR_SHIFT);
bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
while (bufsize >= min_bufsize) {
buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
if (buf) {
*buflen = bufsize;
return buf;
}
bufsize >>= 1;
}
return NULL;
}
static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
struct nvme_zone_report *report,
size_t buflen)
{
struct nvme_command c = { };
int ret;
c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
c.zmr.zra = NVME_ZRA_ZONE_REPORT;
c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
if (ret)
return ret;
return le64_to_cpu(report->nr_zones);
}
static int nvme_zone_parse_entry(struct nvme_ns *ns,
struct nvme_zone_descriptor *entry,
unsigned int idx, report_zones_cb cb,
void *data)
{
struct blk_zone zone = { };
if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
dev_err(ns->ctrl->device, "invalid zone type %#x\n",
entry->zt);
return -EINVAL;
}
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone.cond = entry->zs >> 4;
zone.len = ns->zsze;
zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
return cb(&zone, idx, data);
}
static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data)
{
struct nvme_zone_report *report;
int ret, zone_idx = 0;
unsigned int nz, i;
size_t buflen;
report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
if (!report)
return -ENOMEM;
sector &= ~(ns->zsze - 1);
while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
memset(report, 0, buflen);
ret = __nvme_ns_report_zones(ns, sector, report, buflen);
if (ret < 0)
goto out_free;
nz = min_t(unsigned int, ret, nr_zones);
if (!nz)
break;
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
ret = nvme_zone_parse_entry(ns, &report->entries[i],
zone_idx, cb, data);
if (ret)
goto out_free;
zone_idx++;
}
sector += ns->zsze * nz;
}
if (zone_idx > 0)
ret = zone_idx;
else
ret = -EINVAL;
out_free:
kvfree(report);
return ret;
}
int nvme_report_zones(struct gendisk *disk, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data)
{
struct nvme_ns_head *head = NULL;
struct nvme_ns *ns;
int srcu_idx, ret;
ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
if (unlikely(!ns))
return -EWOULDBLOCK;
if (ns->head->ids.csi == NVME_CSI_ZNS)
ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
else
ret = -EINVAL;
nvme_put_ns_from_disk(head, srcu_idx);
return ret;
}
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *c, enum nvme_zone_mgmt_action action)
{
c->zms.opcode = nvme_cmd_zone_mgmt_send;
c->zms.nsid = cpu_to_le32(ns->head->ns_id);
c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
c->zms.zsa = action;
if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
c->zms.select_all = 1;
return BLK_STS_OK;
}

View File

@ -16,6 +16,18 @@ config NVME_TARGET
To configure the NVMe target you probably want to use the nvmetcli
tool from http://git.infradead.org/users/hch/nvmetcli.git.
config NVME_TARGET_PASSTHRU
bool "NVMe Target Passthrough support"
depends on NVME_TARGET
depends on NVME_CORE=y || NVME_CORE=NVME_TARGET
help
This enables target side NVMe passthru controller support for the
NVMe Over Fabrics protocol. It allows for hosts to manage and
directly access an actual NVMe controller residing on the target
side, incuding executing Vendor Unique Commands.
If unsure, say N.
config NVME_TARGET_LOOP
tristate "NVMe loopback device support"
depends on NVME_TARGET

View File

@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
discovery.o io-cmd-file.o io-cmd-bdev.o
nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o
nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o
nvmet-fc-y += fc.o

View File

@ -113,11 +113,10 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
u64 data_units_read = 0, data_units_written = 0;
struct nvmet_ns *ns;
struct nvmet_ctrl *ctrl;
unsigned long idx;
ctrl = req->sq->ctrl;
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
/* we don't have the right data for file backed ns */
if (!ns->bdev)
continue;
@ -127,9 +126,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
data_units_written += DIV_ROUND_UP(
part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
}
rcu_read_unlock();
put_unaligned_le64(host_reads, &slog->host_reads[0]);
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
@ -230,14 +227,13 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_ns *ns;
unsigned long idx;
u32 count = 0;
if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
xa_for_each(&ctrl->subsys->namespaces, idx, ns)
if (ns->anagrpid == grpid)
desc->nsids[count++] = cpu_to_le32(ns->nsid);
rcu_read_unlock();
}
desc->grpid = cpu_to_le32(grpid);
@ -427,7 +423,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->awupf = 0;
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->has_keyed_sgls)
if (ctrl->ops->flags & NVMF_KEYED_SGLS)
id->sgls |= cpu_to_le32(1 << 2);
if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);
@ -556,6 +552,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_ns *ns;
unsigned long idx;
u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid);
__le32 *list;
u16 status = 0;
@ -567,15 +564,13 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
goto out;
}
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
if (ns->nsid <= min_nsid)
continue;
list[i++] = cpu_to_le32(ns->nsid);
if (i == buf_size / sizeof(__le32))
break;
}
rcu_read_unlock();
status = nvmet_copy_to_sgl(req, 0, list, buf_size);
@ -754,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
return 0;
}
static void nvmet_execute_set_features(struct nvmet_req *req)
void nvmet_execute_set_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
@ -829,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req)
nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
}
static void nvmet_execute_get_features(struct nvmet_req *req)
void nvmet_execute_get_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
@ -945,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
if (unlikely(ret))
return ret;
if (nvmet_req_passthru_ctrl(req))
return nvmet_parse_passthru_admin_cmd(req);
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->execute = nvmet_execute_get_log_page;

View File

@ -666,6 +666,103 @@ static const struct config_item_type nvmet_namespaces_type = {
.ct_owner = THIS_MODULE,
};
#ifdef CONFIG_NVME_TARGET_PASSTHRU
static ssize_t nvmet_passthru_device_path_show(struct config_item *item,
char *page)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
return snprintf(page, PAGE_SIZE, "%s\n", subsys->passthru_ctrl_path);
}
static ssize_t nvmet_passthru_device_path_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
size_t len;
int ret;
mutex_lock(&subsys->lock);
ret = -EBUSY;
if (subsys->passthru_ctrl)
goto out_unlock;
ret = -EINVAL;
len = strcspn(page, "\n");
if (!len)
goto out_unlock;
kfree(subsys->passthru_ctrl_path);
ret = -ENOMEM;
subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL);
if (!subsys->passthru_ctrl_path)
goto out_unlock;
mutex_unlock(&subsys->lock);
return count;
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
}
CONFIGFS_ATTR(nvmet_passthru_, device_path);
static ssize_t nvmet_passthru_enable_show(struct config_item *item,
char *page)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0);
}
static ssize_t nvmet_passthru_enable_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
bool enable;
int ret = 0;
if (strtobool(page, &enable))
return -EINVAL;
if (enable)
ret = nvmet_passthru_ctrl_enable(subsys);
else
nvmet_passthru_ctrl_disable(subsys);
return ret ? ret : count;
}
CONFIGFS_ATTR(nvmet_passthru_, enable);
static struct configfs_attribute *nvmet_passthru_attrs[] = {
&nvmet_passthru_attr_device_path,
&nvmet_passthru_attr_enable,
NULL,
};
static const struct config_item_type nvmet_passthru_type = {
.ct_attrs = nvmet_passthru_attrs,
.ct_owner = THIS_MODULE,
};
static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
{
config_group_init_type_name(&subsys->passthru_group,
"passthru", &nvmet_passthru_type);
configfs_add_default_group(&subsys->passthru_group,
&subsys->group);
}
#else /* CONFIG_NVME_TARGET_PASSTHRU */
static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
{
}
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
static int nvmet_port_subsys_allow_link(struct config_item *parent,
struct config_item *target)
{
@ -862,14 +959,14 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item,
struct nvmet_subsys *subsys = to_subsys(item);
if (NVME_TERTIARY(subsys->ver))
return snprintf(page, PAGE_SIZE, "%d.%d.%d\n",
(int)NVME_MAJOR(subsys->ver),
(int)NVME_MINOR(subsys->ver),
(int)NVME_TERTIARY(subsys->ver));
return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n",
NVME_MAJOR(subsys->ver),
NVME_MINOR(subsys->ver),
NVME_TERTIARY(subsys->ver));
return snprintf(page, PAGE_SIZE, "%d.%d\n",
(int)NVME_MAJOR(subsys->ver),
(int)NVME_MINOR(subsys->ver));
return snprintf(page, PAGE_SIZE, "%llu.%llu\n",
NVME_MAJOR(subsys->ver),
NVME_MINOR(subsys->ver));
}
static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
@ -879,6 +976,10 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
int major, minor, tertiary = 0;
int ret;
/* passthru subsystems use the underlying controller's version */
if (nvmet_passthru_ctrl(subsys))
return -EINVAL;
ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary);
if (ret != 2 && ret != 3)
return -EINVAL;
@ -1121,6 +1222,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group,
configfs_add_default_group(&subsys->allowed_hosts_group,
&subsys->group);
nvmet_add_passthru_group(subsys);
return &subsys->group;
}

View File

@ -115,13 +115,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
{
struct nvmet_ns *ns;
unsigned long nsid = 0;
struct nvmet_ns *cur;
unsigned long idx;
if (list_empty(&subsys->namespaces))
return 0;
xa_for_each(&subsys->namespaces, idx, cur)
nsid = cur->nsid;
ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
return ns->nsid;
return nsid;
}
static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
@ -336,7 +337,7 @@ int nvmet_enable_port(struct nvmet_port *port)
* If the user requested PI support and the transport isn't pi capable,
* don't enable the port.
*/
if (port->pi_enable && !ops->metadata_support) {
if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
pr_err("T10-PI is not supported by transport type %d\n",
port->disc_addr.trtype);
ret = -EINVAL;
@ -410,28 +411,13 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
cancel_delayed_work_sync(&ctrl->ka_work);
}
static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
__le32 nsid)
{
struct nvmet_ns *ns;
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
if (ns->nsid == le32_to_cpu(nsid))
return ns;
}
return NULL;
}
struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
{
struct nvmet_ns *ns;
rcu_read_lock();
ns = __nvmet_find_namespace(ctrl, nsid);
ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid));
if (ns)
percpu_ref_get(&ns->ref);
rcu_read_unlock();
return ns;
}
@ -558,6 +544,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
mutex_lock(&subsys->lock);
ret = 0;
if (nvmet_passthru_ctrl(subsys)) {
pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
goto out_unlock;
}
if (ns->enabled)
goto out_unlock;
@ -586,24 +578,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
if (ns->nsid > subsys->max_nsid)
subsys->max_nsid = ns->nsid;
/*
* The namespaces list needs to be sorted to simplify the implementation
* of the Identify Namepace List subcommand.
*/
if (list_empty(&subsys->namespaces)) {
list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
} else {
struct nvmet_ns *old;
ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
if (ret)
goto out_restore_subsys_maxnsid;
list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
lockdep_is_held(&subsys->lock)) {
BUG_ON(ns->nsid == old->nsid);
if (ns->nsid < old->nsid)
break;
}
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
}
subsys->nr_namespaces++;
nvmet_ns_changed(subsys, ns->nsid);
@ -612,6 +590,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
out_restore_subsys_maxnsid:
subsys->max_nsid = nvmet_max_nsid(subsys);
percpu_ref_exit(&ns->ref);
out_dev_put:
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
@ -630,7 +612,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
goto out_unlock;
ns->enabled = false;
list_del_rcu(&ns->dev_link);
xa_erase(&ns->subsys->namespaces, ns->nsid);
if (ns->nsid == subsys->max_nsid)
subsys->max_nsid = nvmet_max_nsid(subsys);
@ -681,7 +663,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
if (!ns)
return NULL;
INIT_LIST_HEAD(&ns->dev_link);
init_completion(&ns->disable_done);
ns->nsid = nsid;
@ -874,6 +855,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
if (unlikely(ret))
return ret;
if (nvmet_req_passthru_ctrl(req))
return nvmet_parse_passthru_io_cmd(req);
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (unlikely(!req->ns)) {
req->error_loc = offsetof(struct nvme_common_command, nsid);
@ -1263,14 +1247,14 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
struct nvmet_req *req)
{
struct nvmet_ns *ns;
unsigned long idx;
if (!req->p2p_client)
return;
ctrl->p2p_client = get_device(req->p2p_client);
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
lockdep_is_held(&ctrl->subsys->lock))
xa_for_each(&ctrl->subsys->namespaces, idx, ns)
nvmet_p2pmem_ns_add_p2p(ctrl, ns);
}
@ -1495,7 +1479,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
if (!subsys)
return ERR_PTR(-ENOMEM);
subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
subsys->ver = NVMET_DEFAULT_VS;
/* generate a random serial number as our controllers are ephemeral: */
get_random_bytes(&subsys->serial, sizeof(subsys->serial));
@ -1523,7 +1507,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
kref_init(&subsys->ref);
mutex_init(&subsys->lock);
INIT_LIST_HEAD(&subsys->namespaces);
xa_init(&subsys->namespaces);
INIT_LIST_HEAD(&subsys->ctrls);
INIT_LIST_HEAD(&subsys->hosts);
@ -1535,7 +1519,10 @@ static void nvmet_subsys_free(struct kref *ref)
struct nvmet_subsys *subsys =
container_of(ref, struct nvmet_subsys, ref);
WARN_ON_ONCE(!list_empty(&subsys->namespaces));
WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
xa_destroy(&subsys->namespaces);
nvmet_passthru_subsys_free(subsys);
kfree(subsys->subsysnqn);
kfree_rcu(subsys->model, rcuhead);

View File

@ -277,7 +277,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req)
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->has_keyed_sgls)
if (ctrl->ops->flags & NVMF_KEYED_SGLS)
id->sgls |= cpu_to_le32(1 << 2);
if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);

View File

@ -167,7 +167,6 @@ struct nvmet_fc_tgt_assoc {
struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1];
struct kref ref;
struct work_struct del_work;
atomic_t del_work_active;
};
@ -1090,7 +1089,6 @@ nvmet_fc_delete_assoc(struct work_struct *work)
container_of(work, struct nvmet_fc_tgt_assoc, del_work);
nvmet_fc_delete_target_assoc(assoc);
atomic_set(&assoc->del_work_active, 0);
nvmet_fc_tgt_a_put(assoc);
}
@ -1123,7 +1121,6 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle)
INIT_LIST_HEAD(&assoc->a_list);
kref_init(&assoc->ref);
INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc);
atomic_set(&assoc->del_work_active, 0);
atomic_set(&assoc->terminating, 0);
while (needrandom) {
@ -1243,7 +1240,8 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport,
list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
if (association_id == assoc->association_id) {
ret = assoc;
nvmet_fc_tgt_a_get(assoc);
if (!nvmet_fc_tgt_a_get(assoc))
ret = NULL;
break;
}
}
@ -1477,21 +1475,15 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport)
{
struct nvmet_fc_tgt_assoc *assoc, *next;
unsigned long flags;
int ret;
spin_lock_irqsave(&tgtport->lock, flags);
list_for_each_entry_safe(assoc, next,
&tgtport->assoc_list, a_list) {
if (!nvmet_fc_tgt_a_get(assoc))
continue;
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
if (ret == 0) {
if (!schedule_work(&assoc->del_work))
nvmet_fc_tgt_a_put(assoc);
} else {
if (!schedule_work(&assoc->del_work))
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
}
}
spin_unlock_irqrestore(&tgtport->lock, flags);
}
@ -1533,7 +1525,6 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
struct nvmet_fc_tgt_assoc *assoc, *next;
unsigned long flags;
bool noassoc = true;
int ret;
spin_lock_irqsave(&tgtport->lock, flags);
list_for_each_entry_safe(assoc, next,
@ -1545,14 +1536,9 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
continue;
assoc->hostport->invalid = 1;
noassoc = false;
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
if (ret == 0) {
if (!schedule_work(&assoc->del_work))
nvmet_fc_tgt_a_put(assoc);
} else {
if (!schedule_work(&assoc->del_work))
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
}
}
spin_unlock_irqrestore(&tgtport->lock, flags);
@ -1573,7 +1559,6 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
struct nvmet_fc_tgt_queue *queue;
unsigned long flags;
bool found_ctrl = false;
int ret;
/* this is a bit ugly, but don't want to make locks layered */
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
@ -1597,14 +1582,9 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
nvmet_fc_tgtport_put(tgtport);
if (found_ctrl) {
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
if (ret == 0) {
if (!schedule_work(&assoc->del_work))
nvmet_fc_tgt_a_put(assoc);
} else {
if (!schedule_work(&assoc->del_work))
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
}
return;
}

View File

@ -43,6 +43,17 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_ERR, NULL }
};
static int fcloop_verify_addr(substring_t *s)
{
size_t blen = s->to - s->from + 1;
if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 ||
strncmp(s->from, "0x", 2))
return -EINVAL;
return 0;
}
static int
fcloop_parse_options(struct fcloop_ctrl_options *opts,
const char *buf)
@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
opts->mask |= token;
switch (token) {
case NVMF_OPT_WWNN:
if (match_u64(args, &token64)) {
if (fcloop_verify_addr(args) ||
match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
opts->wwnn = token64;
break;
case NVMF_OPT_WWPN:
if (match_u64(args, &token64)) {
if (fcloop_verify_addr(args) ||
match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
opts->fcaddr = token;
break;
case NVMF_OPT_LPWWNN:
if (match_u64(args, &token64)) {
if (fcloop_verify_addr(args) ||
match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
opts->lpwwnn = token64;
break;
case NVMF_OPT_LPWWPN:
if (match_u64(args, &token64)) {
if (fcloop_verify_addr(args) ||
match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname,
token = match_token(p, opt_tokens, args);
switch (token) {
case NVMF_OPT_WWNN:
if (match_u64(args, &token64)) {
if (fcloop_verify_addr(args) ||
match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}
*nname = token64;
break;
case NVMF_OPT_WWPN:
if (match_u64(args, &token64)) {
if (fcloop_verify_addr(args) ||
match_u64(args, &token64)) {
ret = -EINVAL;
goto out_free_options;
}

View File

@ -36,7 +36,6 @@ struct nvme_loop_ctrl {
struct nvme_loop_iod async_event_iod;
struct nvme_ctrl ctrl;
struct nvmet_ctrl *target_ctrl;
struct nvmet_port *port;
};
@ -445,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
{
struct nvme_loop_ctrl *ctrl =
container_of(work, struct nvme_loop_ctrl, ctrl.reset_work);
bool changed;
int ret;
nvme_stop_ctrl(&ctrl->ctrl);
@ -472,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
ctrl->ctrl.queue_count - 1);
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
WARN_ON_ONCE(1);
nvme_start_ctrl(&ctrl->ctrl);
@ -568,7 +566,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_loop_ctrl *ctrl;
bool changed;
int ret;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
@ -584,6 +581,9 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
if (ret)
goto out_put_ctrl;
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
WARN_ON_ONCE(1);
ret = -ENOMEM;
ctrl->ctrl.sqsize = opts->queue_size - 1;
@ -618,8 +618,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
dev_info(ctrl->ctrl.device,
"new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
WARN_ON_ONCE(1);
mutex_lock(&nvme_loop_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_loop_ctrl_list);

View File

@ -21,6 +21,8 @@
#include <linux/radix-tree.h>
#include <linux/t10-pi.h>
#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0)
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
#define NVMET_NO_ERROR_LOC ((u16)-1)
@ -52,7 +54,6 @@
(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
struct nvmet_ns {
struct list_head dev_link;
struct percpu_ref ref;
struct block_device *bdev;
struct file *file;
@ -219,7 +220,7 @@ struct nvmet_subsys {
struct mutex lock;
struct kref ref;
struct list_head namespaces;
struct xarray namespaces;
unsigned int nr_namespaces;
unsigned int max_nsid;
u16 cntlid_min;
@ -243,6 +244,12 @@ struct nvmet_subsys {
struct config_group allowed_hosts_group;
struct nvmet_subsys_model __rcu *model;
#ifdef CONFIG_NVME_TARGET_PASSTHRU
struct nvme_ctrl *passthru_ctrl;
char *passthru_ctrl_path;
struct config_group passthru_group;
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
};
static inline struct nvmet_subsys *to_subsys(struct config_item *item)
@ -286,8 +293,9 @@ struct nvmet_fabrics_ops {
struct module *owner;
unsigned int type;
unsigned int msdbd;
bool has_keyed_sgls : 1;
bool metadata_support : 1;
unsigned int flags;
#define NVMF_KEYED_SGLS (1 << 0)
#define NVMF_METADATA_SUPPORTED (1 << 1)
void (*queue_response)(struct nvmet_req *req);
int (*add_port)(struct nvmet_port *port);
void (*remove_port)(struct nvmet_port *port);
@ -321,6 +329,11 @@ struct nvmet_req {
struct bio_vec *bvec;
struct work_struct work;
} f;
struct {
struct request *rq;
struct work_struct work;
bool use_workqueue;
} p;
};
int sg_cnt;
int metadata_sg_cnt;
@ -400,6 +413,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
int nvmet_req_alloc_sgls(struct nvmet_req *req);
void nvmet_req_free_sgls(struct nvmet_req *req);
void nvmet_execute_set_features(struct nvmet_req *req);
void nvmet_execute_get_features(struct nvmet_req *req);
void nvmet_execute_keep_alive(struct nvmet_req *req);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
@ -532,6 +547,43 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req)
sizeof(struct nvme_dsm_range);
}
#ifdef CONFIG_NVME_TARGET_PASSTHRU
void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys);
u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req);
u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req);
static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
{
return subsys->passthru_ctrl;
}
#else /* CONFIG_NVME_TARGET_PASSTHRU */
static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
{
}
static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
{
}
static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
{
return 0;
}
static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
{
return 0;
}
static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
{
return NULL;
}
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
static inline struct nvme_ctrl *
nvmet_req_passthru_ctrl(struct nvmet_req *req)
{
return nvmet_passthru_ctrl(req->sq->ctrl->subsys);
}
u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
/* Convert a 32-bit number to a 16-bit 0's based number */

View File

@ -0,0 +1,544 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NVMe Over Fabrics Target Passthrough command implementation.
*
* Copyright (c) 2017-2018 Western Digital Corporation or its
* affiliates.
* Copyright (c) 2019-2020, Eideticom Inc.
*
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include "../host/nvme.h"
#include "nvmet.h"
MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU);
/*
* xarray to maintain one passthru subsystem per nvme controller.
*/
static DEFINE_XARRAY(passthru_subsystems);
static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
u16 status = NVME_SC_SUCCESS;
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
int page_shift;
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return NVME_SC_INTERNAL;
status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id));
if (status)
goto out_free;
id->cntlid = cpu_to_le16(ctrl->cntlid);
id->ver = cpu_to_le32(ctrl->subsys->ver);
/*
* The passthru NVMe driver may have a limit on the number of segments
* which depends on the host's memory fragementation. To solve this,
* ensure mdts is limited to the pages equal to the number of segments.
*/
max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9),
pctrl->max_hw_sectors);
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
id->mdts = ilog2(max_hw_sectors) + 9 - page_shift;
id->acl = 3;
/*
* We export aerl limit for the fabrics controller, update this when
* passthru based aerl support is added.
*/
id->aerl = NVMET_ASYNC_EVENTS - 1;
/* emulate kas as most of the PCIe ctrl don't have a support for kas */
id->kas = cpu_to_le16(NVMET_KAS);
/* don't support host memory buffer */
id->hmpre = 0;
id->hmmin = 0;
id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes);
id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes);
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
/* don't support fuse commands */
id->fuses = 0;
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->flags & NVMF_KEYED_SGLS)
id->sgls |= cpu_to_le32(1 << 2);
if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20);
/*
* When passsthru controller is setup using nvme-loop transport it will
* export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in
* the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl()
* code path with duplicate ctr subsynqn. In order to prevent that we
* mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn.
*/
memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn));
/* use fabric id-ctrl values */
id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
req->port->inline_data_size) / 16);
id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
id->msdbd = ctrl->ops->msdbd;
/* Support multipath connections with fabrics */
id->cmic |= 1 << 1;
/* Disable reservations, see nvmet_parse_passthru_io_cmd() */
id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS);
status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl));
out_free:
kfree(id);
return status;
}
static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req)
{
u16 status = NVME_SC_SUCCESS;
struct nvme_id_ns *id;
int i;
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id)
return NVME_SC_INTERNAL;
status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns));
if (status)
goto out_free;
for (i = 0; i < (id->nlbaf + 1); i++)
if (id->lbaf[i].ms)
memset(&id->lbaf[i], 0, sizeof(id->lbaf[i]));
id->flbas = id->flbas & ~(1 << 4);
/*
* Presently the NVMEof target code does not support sending
* metadata, so we must disable it here. This should be updated
* once target starts supporting metadata.
*/
id->mc = 0;
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
out_free:
kfree(id);
return status;
}
static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
{
struct nvmet_req *req = container_of(w, struct nvmet_req, p.work);
struct request *rq = req->p.rq;
u16 status;
nvme_execute_passthru_rq(rq);
status = nvme_req(rq)->status;
if (status == NVME_SC_SUCCESS &&
req->cmd->common.opcode == nvme_admin_identify) {
switch (req->cmd->identify.cns) {
case NVME_ID_CNS_CTRL:
nvmet_passthru_override_id_ctrl(req);
break;
case NVME_ID_CNS_NS:
nvmet_passthru_override_id_ns(req);
break;
}
}
req->cqe->result = nvme_req(rq)->result;
nvmet_req_complete(req, status);
blk_put_request(rq);
}
static void nvmet_passthru_req_done(struct request *rq,
blk_status_t blk_status)
{
struct nvmet_req *req = rq->end_io_data;
req->cqe->result = nvme_req(rq)->result;
nvmet_req_complete(req, nvme_req(rq)->status);
blk_put_request(rq);
}
static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
{
int sg_cnt = req->sg_cnt;
struct scatterlist *sg;
int op_flags = 0;
struct bio *bio;
int i, ret;
if (req->cmd->common.opcode == nvme_cmd_flush)
op_flags = REQ_FUA;
else if (nvme_is_write(req->cmd))
op_flags = REQ_SYNC | REQ_IDLE;
bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
bio->bi_end_io = bio_put;
bio->bi_opf = req_op(rq) | op_flags;
for_each_sg(req->sg, sg, req->sg_cnt, i) {
if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length,
sg->offset) < sg->length) {
bio_put(bio);
return -EINVAL;
}
sg_cnt--;
}
ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
bio_put(bio);
return ret;
}
return 0;
}
static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
{
struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
struct request_queue *q = ctrl->admin_q;
struct nvme_ns *ns = NULL;
struct request *rq = NULL;
u32 effects;
u16 status;
int ret;
if (likely(req->sq->qid != 0)) {
u32 nsid = le32_to_cpu(req->cmd->common.nsid);
ns = nvme_find_get_ns(ctrl, nsid);
if (unlikely(!ns)) {
pr_err("failed to get passthru ns nsid:%u\n", nsid);
status = NVME_SC_INVALID_NS | NVME_SC_DNR;
goto fail_out;
}
q = ns->queue;
}
rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
if (IS_ERR(rq)) {
rq = NULL;
status = NVME_SC_INTERNAL;
goto fail_out;
}
if (req->sg_cnt) {
ret = nvmet_passthru_map_sg(req, rq);
if (unlikely(ret)) {
status = NVME_SC_INTERNAL;
goto fail_out;
}
}
/*
* If there are effects for the command we are about to execute, or
* an end_req function we need to use nvme_execute_passthru_rq()
* synchronously in a work item seeing the end_req function and
* nvme_passthru_end() can't be called in the request done callback
* which is typically in interrupt context.
*/
effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode);
if (req->p.use_workqueue || effects) {
INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work);
req->p.rq = rq;
schedule_work(&req->p.work);
} else {
rq->end_io_data = req;
blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0,
nvmet_passthru_req_done);
}
if (ns)
nvme_put_ns(ns);
return;
fail_out:
if (ns)
nvme_put_ns(ns);
nvmet_req_complete(req, status);
blk_put_request(rq);
}
/*
* We need to emulate set host behaviour to ensure that any requested
* behaviour of the target's host matches the requested behaviour
* of the device's host and fail otherwise.
*/
static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req)
{
struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
struct nvme_feat_host_behavior *host;
u16 status = NVME_SC_INTERNAL;
int ret;
host = kzalloc(sizeof(*host) * 2, GFP_KERNEL);
if (!host)
goto out_complete_req;
ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
host, sizeof(*host), NULL);
if (ret)
goto out_free_host;
status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host));
if (status)
goto out_free_host;
if (memcmp(&host[0], &host[1], sizeof(host[0]))) {
pr_warn("target host has requested different behaviour from the local host\n");
status = NVME_SC_INTERNAL;
}
out_free_host:
kfree(host);
out_complete_req:
nvmet_req_complete(req, status);
}
static u16 nvmet_setup_passthru_command(struct nvmet_req *req)
{
req->p.use_workqueue = false;
req->execute = nvmet_passthru_execute_cmd;
return NVME_SC_SUCCESS;
}
u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
{
switch (req->cmd->common.opcode) {
case nvme_cmd_resv_register:
case nvme_cmd_resv_report:
case nvme_cmd_resv_acquire:
case nvme_cmd_resv_release:
/*
* Reservations cannot be supported properly because the
* underlying device has no way of differentiating different
* hosts that connect via fabrics. This could potentially be
* emulated in the future if regular targets grow support for
* this feature.
*/
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
return nvmet_setup_passthru_command(req);
}
/*
* Only features that are emulated or specifically allowed in the list are
* passed down to the controller. This function implements the allow list for
* both get and set features.
*/
static u16 nvmet_passthru_get_set_features(struct nvmet_req *req)
{
switch (le32_to_cpu(req->cmd->features.fid)) {
case NVME_FEAT_ARBITRATION:
case NVME_FEAT_POWER_MGMT:
case NVME_FEAT_LBA_RANGE:
case NVME_FEAT_TEMP_THRESH:
case NVME_FEAT_ERR_RECOVERY:
case NVME_FEAT_VOLATILE_WC:
case NVME_FEAT_WRITE_ATOMIC:
case NVME_FEAT_AUTO_PST:
case NVME_FEAT_TIMESTAMP:
case NVME_FEAT_HCTM:
case NVME_FEAT_NOPSC:
case NVME_FEAT_RRL:
case NVME_FEAT_PLM_CONFIG:
case NVME_FEAT_PLM_WINDOW:
case NVME_FEAT_HOST_BEHAVIOR:
case NVME_FEAT_SANITIZE:
case NVME_FEAT_VENDOR_START ... NVME_FEAT_VENDOR_END:
return nvmet_setup_passthru_command(req);
case NVME_FEAT_ASYNC_EVENT:
/* There is no support for forwarding ASYNC events */
case NVME_FEAT_IRQ_COALESCE:
case NVME_FEAT_IRQ_CONFIG:
/* The IRQ settings will not apply to the target controller */
case NVME_FEAT_HOST_MEM_BUF:
/*
* Any HMB that's set will not be passed through and will
* not work as expected
*/
case NVME_FEAT_SW_PROGRESS:
/*
* The Pre-Boot Software Load Count doesn't make much
* sense for a target to export
*/
case NVME_FEAT_RESV_MASK:
case NVME_FEAT_RESV_PERSIST:
/* No reservations, see nvmet_parse_passthru_io_cmd() */
default:
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
{
/*
* Passthru all vendor specific commands
*/
if (req->cmd->common.opcode >= nvme_admin_vendor_start)
return nvmet_setup_passthru_command(req);
switch (req->cmd->common.opcode) {
case nvme_admin_async_event:
req->execute = nvmet_execute_async_event;
return NVME_SC_SUCCESS;
case nvme_admin_keep_alive:
/*
* Most PCIe ctrls don't support keep alive cmd, we route keep
* alive to the non-passthru mode. In future please change this
* code when PCIe ctrls with keep alive support available.
*/
req->execute = nvmet_execute_keep_alive;
return NVME_SC_SUCCESS;
case nvme_admin_set_features:
switch (le32_to_cpu(req->cmd->features.fid)) {
case NVME_FEAT_ASYNC_EVENT:
case NVME_FEAT_KATO:
case NVME_FEAT_NUM_QUEUES:
case NVME_FEAT_HOST_ID:
req->execute = nvmet_execute_set_features;
return NVME_SC_SUCCESS;
case NVME_FEAT_HOST_BEHAVIOR:
req->execute = nvmet_passthru_set_host_behaviour;
return NVME_SC_SUCCESS;
default:
return nvmet_passthru_get_set_features(req);
}
break;
case nvme_admin_get_features:
switch (le32_to_cpu(req->cmd->features.fid)) {
case NVME_FEAT_ASYNC_EVENT:
case NVME_FEAT_KATO:
case NVME_FEAT_NUM_QUEUES:
case NVME_FEAT_HOST_ID:
req->execute = nvmet_execute_get_features;
return NVME_SC_SUCCESS;
default:
return nvmet_passthru_get_set_features(req);
}
break;
case nvme_admin_identify:
switch (req->cmd->identify.cns) {
case NVME_ID_CNS_CTRL:
req->execute = nvmet_passthru_execute_cmd;
req->p.use_workqueue = true;
return NVME_SC_SUCCESS;
case NVME_ID_CNS_NS:
req->execute = nvmet_passthru_execute_cmd;
req->p.use_workqueue = true;
return NVME_SC_SUCCESS;
default:
return nvmet_setup_passthru_command(req);
}
case nvme_admin_get_log_page:
return nvmet_setup_passthru_command(req);
default:
/* Reject commands not in the allowlist above */
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
{
struct nvme_ctrl *ctrl;
int ret = -EINVAL;
void *old;
mutex_lock(&subsys->lock);
if (!subsys->passthru_ctrl_path)
goto out_unlock;
if (subsys->passthru_ctrl)
goto out_unlock;
if (subsys->nr_namespaces) {
pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
goto out_unlock;
}
ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
pr_err("failed to open nvme controller %s\n",
subsys->passthru_ctrl_path);
goto out_unlock;
}
old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL,
subsys, GFP_KERNEL);
if (xa_is_err(old)) {
ret = xa_err(old);
goto out_put_ctrl;
}
if (old)
goto out_put_ctrl;
subsys->passthru_ctrl = ctrl;
subsys->ver = ctrl->vs;
if (subsys->ver < NVME_VS(1, 2, 1)) {
pr_warn("nvme controller version is too old: %llu.%llu.%llu, advertising 1.2.1\n",
NVME_MAJOR(subsys->ver), NVME_MINOR(subsys->ver),
NVME_TERTIARY(subsys->ver));
subsys->ver = NVME_VS(1, 2, 1);
}
mutex_unlock(&subsys->lock);
return 0;
out_put_ctrl:
nvme_put_ctrl(ctrl);
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
}
static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
{
if (subsys->passthru_ctrl) {
xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid);
nvme_put_ctrl(subsys->passthru_ctrl);
}
subsys->passthru_ctrl = NULL;
subsys->ver = NVMET_DEFAULT_VS;
}
void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
{
mutex_lock(&subsys->lock);
__nvmet_passthru_ctrl_disable(subsys);
mutex_unlock(&subsys->lock);
}
void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
{
mutex_lock(&subsys->lock);
__nvmet_passthru_ctrl_disable(subsys);
mutex_unlock(&subsys->lock);
kfree(subsys->passthru_ctrl_path);
}

View File

@ -752,7 +752,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_rsp *rsp =
container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
struct nvmet_rdma_queue *queue = cq->cq_context;
struct nvmet_rdma_queue *queue = wc->qp->qp_context;
u16 status = 0;
WARN_ON(rsp->n_rdma <= 0);
@ -1008,7 +1008,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_cmd *cmd =
container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
struct nvmet_rdma_queue *queue = cq->cq_context;
struct nvmet_rdma_queue *queue = wc->qp->qp_context;
struct nvmet_rdma_rsp *rsp;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
@ -1258,9 +1258,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
*/
nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
queue->cq = ib_alloc_cq(ndev->device, queue,
nr_cqe + 1, queue->comp_vector,
IB_POLL_WORKQUEUE);
queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1,
queue->comp_vector, IB_POLL_WORKQUEUE);
if (IS_ERR(queue->cq)) {
ret = PTR_ERR(queue->cq);
pr_err("failed to create CQ cqe= %d ret= %d\n",
@ -1322,7 +1321,7 @@ out:
err_destroy_qp:
rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
ib_free_cq(queue->cq);
ib_cq_pool_put(queue->cq, nr_cqe + 1);
goto out;
}
@ -1332,7 +1331,8 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
if (queue->cm_id)
rdma_destroy_id(queue->cm_id);
ib_destroy_qp(queue->qp);
ib_free_cq(queue->cq);
ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 *
queue->send_queue_size + 1);
}
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
@ -1970,8 +1970,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
.owner = THIS_MODULE,
.type = NVMF_TRTYPE_RDMA,
.msdbd = 1,
.has_keyed_sgls = 1,
.metadata_support = 1,
.flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED,
.add_port = nvmet_rdma_add_port,
.remove_port = nvmet_rdma_remove_port,
.queue_response = nvmet_rdma_queue_response,

View File

@ -459,17 +459,11 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
struct llist_node *node;
struct nvmet_tcp_cmd *cmd;
node = llist_del_all(&queue->resp_list);
if (!node)
return;
while (node) {
struct nvmet_tcp_cmd *cmd = llist_entry(node,
struct nvmet_tcp_cmd, lentry);
for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
list_add(&cmd->entry, &queue->resp_send_list);
node = node->next;
queue->send_list_len++;
}
}
@ -1717,7 +1711,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
.owner = THIS_MODULE,
.type = NVMF_TRTYPE_TCP,
.msdbd = 1,
.has_keyed_sgls = 0,
.add_port = nvmet_tcp_add_port,
.remove_port = nvmet_tcp_remove_port,
.queue_response = nvmet_tcp_queue_response,

View File

@ -319,7 +319,7 @@ dasd_diag_check_device(struct dasd_device *device)
struct dasd_diag_characteristics *rdc_data;
struct vtoc_cms_label *label;
struct dasd_block *block;
struct dasd_diag_bio bio;
struct dasd_diag_bio *bio;
unsigned int sb, bsize;
blocknum_t end_block;
int rc;
@ -395,29 +395,36 @@ dasd_diag_check_device(struct dasd_device *device)
rc = -ENOMEM;
goto out;
}
bio = kzalloc(sizeof(*bio), GFP_KERNEL);
if (bio == NULL) {
DBF_DEV_EVENT(DBF_WARNING, device, "%s",
"No memory to allocate initialization bio");
rc = -ENOMEM;
goto out_label;
}
rc = 0;
end_block = 0;
/* try all sizes - needed for ECKD devices */
for (bsize = 512; bsize <= PAGE_SIZE; bsize <<= 1) {
mdsk_init_io(device, bsize, 0, &end_block);
memset(&bio, 0, sizeof (struct dasd_diag_bio));
bio.type = MDSK_READ_REQ;
bio.block_number = private->pt_block + 1;
bio.buffer = label;
memset(bio, 0, sizeof(*bio));
bio->type = MDSK_READ_REQ;
bio->block_number = private->pt_block + 1;
bio->buffer = label;
memset(&private->iob, 0, sizeof (struct dasd_diag_rw_io));
private->iob.dev_nr = rdc_data->dev_nr;
private->iob.key = 0;
private->iob.flags = 0; /* do synchronous io */
private->iob.block_count = 1;
private->iob.interrupt_params = 0;
private->iob.bio_list = &bio;
private->iob.bio_list = bio;
private->iob.flaga = DASD_DIAG_FLAGA_DEFAULT;
rc = dia250(&private->iob, RW_BIO);
if (rc == 3) {
pr_warn("%s: A 64-bit DIAG call failed\n",
dev_name(&device->cdev->dev));
rc = -EOPNOTSUPP;
goto out_label;
goto out_bio;
}
mdsk_term_io(device);
if (rc == 0)
@ -427,7 +434,7 @@ dasd_diag_check_device(struct dasd_device *device)
pr_warn("%s: Accessing the DASD failed because of an incorrect format (rc=%d)\n",
dev_name(&device->cdev->dev), rc);
rc = -EIO;
goto out_label;
goto out_bio;
}
/* check for label block */
if (memcmp(label->label_id, DASD_DIAG_CMS1,
@ -457,6 +464,8 @@ dasd_diag_check_device(struct dasd_device *device)
(rc == 4) ? ", read-only device" : "");
rc = 0;
}
out_bio:
kfree(bio);
out_label:
free_page((long) label);
out:
@ -506,7 +515,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev,
struct req_iterator iter;
struct bio_vec bv;
char *dst;
unsigned int count, datasize;
unsigned int count;
sector_t recid, first_rec, last_rec;
unsigned int blksize, off;
unsigned char rw_cmd;
@ -534,10 +543,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev,
if (count != last_rec - first_rec + 1)
return ERR_PTR(-EINVAL);
/* Build the request */
datasize = sizeof(struct dasd_diag_req) +
count*sizeof(struct dasd_diag_bio);
cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, datasize, memdev,
blk_mq_rq_to_pdu(req));
cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, struct_size(dreq, bio, count),
memdev, blk_mq_rq_to_pdu(req));
if (IS_ERR(cqr))
return cqr;

View File

@ -59,6 +59,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
zone.non_seq = 1;
zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8]));
zone.capacity = zone.len;
zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16]));
zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24]));
if (zone.type != ZBC_ZONE_TYPE_CONV &&
@ -716,6 +717,11 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
/* The drive satisfies the kernel restrictions: set it up */
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
if (sdkp->zones_max_open == U32_MAX)
blk_queue_max_open_zones(q, 0);
else
blk_queue_max_open_zones(q, sdkp->zones_max_open);
blk_queue_max_active_zones(q, 0);
nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
/* READ16/WRITE16 is mandatory for ZBC disks */

View File

@ -513,6 +513,8 @@ struct request_queue {
unsigned int nr_zones;
unsigned long *conv_zones_bitmap;
unsigned long *seq_zones_wlock;
unsigned int max_open_zones;
unsigned int max_active_zones;
#endif /* CONFIG_BLK_DEV_ZONED */
/*
@ -722,6 +724,28 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
return true;
return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap);
}
static inline void blk_queue_max_open_zones(struct request_queue *q,
unsigned int max_open_zones)
{
q->max_open_zones = max_open_zones;
}
static inline unsigned int queue_max_open_zones(const struct request_queue *q)
{
return q->max_open_zones;
}
static inline void blk_queue_max_active_zones(struct request_queue *q,
unsigned int max_active_zones)
{
q->max_active_zones = max_active_zones;
}
static inline unsigned int queue_max_active_zones(const struct request_queue *q)
{
return q->max_active_zones;
}
#else /* CONFIG_BLK_DEV_ZONED */
static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
{
@ -737,6 +761,14 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q,
{
return 0;
}
static inline unsigned int queue_max_open_zones(const struct request_queue *q)
{
return 0;
}
static inline unsigned int queue_max_active_zones(const struct request_queue *q)
{
return 0;
}
#endif /* CONFIG_BLK_DEV_ZONED */
static inline bool rq_is_sync(struct request *rq)
@ -1520,6 +1552,24 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
return 0;
}
static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
if (q)
return queue_max_open_zones(q);
return 0;
}
static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
if (q)
return queue_max_active_zones(q);
return 0;
}
static inline int queue_dma_alignment(const struct request_queue *q)
{
return q ? q->dma_alignment : 511;

View File

@ -672,7 +672,7 @@ enum {
* Values set by the LLDD indicating completion status of the FCP operation.
* Must be set prior to calling the done() callback.
* @transferred_length: amount of DATA_OUT payload data received by a
* a WRITEDATA operation. If not a WRITEDATA operation, value must
* WRITEDATA operation. If not a WRITEDATA operation, value must
* be set to 0. Should equal transfer_length on success.
* @fcp_error: status of the FCP operation. Must be 0 on success; on failure
* must be a NVME_SC_FC_xxxx value.

View File

@ -132,6 +132,7 @@ enum {
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf)
#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1)
#define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff)
#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf)
#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf)
@ -162,7 +163,6 @@ enum {
enum {
NVME_CC_ENABLE = 1 << 0,
NVME_CC_CSS_NVM = 0 << 4,
NVME_CC_EN_SHIFT = 0,
NVME_CC_CSS_SHIFT = 4,
NVME_CC_MPS_SHIFT = 7,
@ -170,6 +170,9 @@ enum {
NVME_CC_SHN_SHIFT = 14,
NVME_CC_IOSQES_SHIFT = 16,
NVME_CC_IOCQES_SHIFT = 20,
NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT,
NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT,
NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT,
NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT,
NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT,
NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT,
@ -179,6 +182,8 @@ enum {
NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT,
NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
NVME_CAP_CSS_NVM = 1 << 0,
NVME_CAP_CSS_CSI = 1 << 6,
NVME_CSTS_RDY = 1 << 0,
NVME_CSTS_CFS = 1 << 1,
NVME_CSTS_NSSRO = 1 << 4,
@ -307,6 +312,7 @@ enum {
NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1,
NVME_CTRL_ONCS_DSM = 1 << 2,
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_ONCS_RESERVATIONS = 1 << 5,
NVME_CTRL_ONCS_TIMESTAMP = 1 << 6,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
@ -369,11 +375,37 @@ struct nvme_id_ns {
__u8 vs[3712];
};
struct nvme_zns_lbafe {
__le64 zsze;
__u8 zdes;
__u8 rsvd9[7];
};
struct nvme_id_ns_zns {
__le16 zoc;
__le16 ozcs;
__le32 mar;
__le32 mor;
__le32 rrl;
__le32 frl;
__u8 rsvd20[2796];
struct nvme_zns_lbafe lbafe[16];
__u8 rsvd3072[768];
__u8 vs[256];
};
struct nvme_id_ctrl_zns {
__u8 zasl;
__u8 rsvd1[4095];
};
enum {
NVME_ID_CNS_NS = 0x00,
NVME_ID_CNS_CTRL = 0x01,
NVME_ID_CNS_NS_ACTIVE_LIST = 0x02,
NVME_ID_CNS_NS_DESC_LIST = 0x03,
NVME_ID_CNS_CS_NS = 0x05,
NVME_ID_CNS_CS_CTRL = 0x06,
NVME_ID_CNS_NS_PRESENT_LIST = 0x10,
NVME_ID_CNS_NS_PRESENT = 0x11,
NVME_ID_CNS_CTRL_NS_LIST = 0x12,
@ -383,6 +415,11 @@ enum {
NVME_ID_CNS_UUID_LIST = 0x17,
};
enum {
NVME_CSI_NVM = 0,
NVME_CSI_ZNS = 2,
};
enum {
NVME_DIR_IDENTIFY = 0x00,
NVME_DIR_STREAMS = 0x01,
@ -435,11 +472,13 @@ struct nvme_ns_id_desc {
#define NVME_NIDT_EUI64_LEN 8
#define NVME_NIDT_NGUID_LEN 16
#define NVME_NIDT_UUID_LEN 16
#define NVME_NIDT_CSI_LEN 1
enum {
NVME_NIDT_EUI64 = 0x01,
NVME_NIDT_NGUID = 0x02,
NVME_NIDT_UUID = 0x03,
NVME_NIDT_CSI = 0x04,
};
struct nvme_smart_log {
@ -519,6 +558,27 @@ struct nvme_ana_rsp_hdr {
__le16 rsvd10[3];
};
struct nvme_zone_descriptor {
__u8 zt;
__u8 zs;
__u8 za;
__u8 rsvd3[5];
__le64 zcap;
__le64 zslba;
__le64 wp;
__u8 rsvd32[32];
};
enum {
NVME_ZONE_TYPE_SEQWRITE_REQ = 0x2,
};
struct nvme_zone_report {
__le64 nr_zones;
__u8 resv8[56];
struct nvme_zone_descriptor entries[];
};
enum {
NVME_SMART_CRIT_SPARE = 1 << 0,
NVME_SMART_CRIT_TEMPERATURE = 1 << 1,
@ -613,6 +673,9 @@ enum nvme_opcode {
nvme_cmd_resv_report = 0x0e,
nvme_cmd_resv_acquire = 0x11,
nvme_cmd_resv_release = 0x15,
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
nvme_cmd_zone_append = 0x7d,
};
#define nvme_opcode_name(opcode) { opcode, #opcode }
@ -751,6 +814,7 @@ struct nvme_rw_command {
enum {
NVME_RW_LR = 1 << 15,
NVME_RW_FUA = 1 << 14,
NVME_RW_APPEND_PIREMAP = 1 << 9,
NVME_RW_DSM_FREQ_UNSPEC = 0,
NVME_RW_DSM_FREQ_TYPICAL = 1,
NVME_RW_DSM_FREQ_RARE = 2,
@ -816,6 +880,53 @@ struct nvme_write_zeroes_cmd {
__le16 appmask;
};
enum nvme_zone_mgmt_action {
NVME_ZONE_CLOSE = 0x1,
NVME_ZONE_FINISH = 0x2,
NVME_ZONE_OPEN = 0x3,
NVME_ZONE_RESET = 0x4,
NVME_ZONE_OFFLINE = 0x5,
NVME_ZONE_SET_DESC_EXT = 0x10,
};
struct nvme_zone_mgmt_send_cmd {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__le32 cdw2[2];
__le64 metadata;
union nvme_data_ptr dptr;
__le64 slba;
__le32 cdw12;
__u8 zsa;
__u8 select_all;
__u8 rsvd13[2];
__le32 cdw14[2];
};
struct nvme_zone_mgmt_recv_cmd {
__u8 opcode;
__u8 flags;
__u16 command_id;
__le32 nsid;
__le64 rsvd2[2];
union nvme_data_ptr dptr;
__le64 slba;
__le32 numd;
__u8 zra;
__u8 zrasf;
__u8 pr;
__u8 rsvd13;
__le32 cdw14[2];
};
enum {
NVME_ZRA_ZONE_REPORT = 0,
NVME_ZRASF_ZONE_REPORT_ALL = 0,
NVME_REPORT_ZONE_PARTIAL = 1,
};
/* Features */
enum {
@ -872,6 +983,7 @@ enum nvme_admin_opcode {
nvme_admin_security_recv = 0x82,
nvme_admin_sanitize_nvm = 0x84,
nvme_admin_get_lba_status = 0x86,
nvme_admin_vendor_start = 0xC0,
};
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
@ -935,6 +1047,8 @@ enum {
NVME_FEAT_RESV_MASK = 0x82,
NVME_FEAT_RESV_PERSIST = 0x83,
NVME_FEAT_WRITE_PROTECT = 0x84,
NVME_FEAT_VENDOR_START = 0xC0,
NVME_FEAT_VENDOR_END = 0xFF,
NVME_LOG_ERROR = 0x01,
NVME_LOG_SMART = 0x02,
NVME_LOG_FW_SLOT = 0x03,
@ -972,7 +1086,9 @@ struct nvme_identify {
__u8 cns;
__u8 rsvd3;
__le16 ctrlid;
__u32 rsvd11[5];
__u8 rsvd11[3];
__u8 csi;
__u32 rsvd12[4];
};
#define NVME_IDENTIFY_DATA_SIZE 4096
@ -1086,7 +1202,9 @@ struct nvme_get_log_page_command {
};
__le64 lpo;
};
__u32 rsvd14[2];
__u8 rsvd14[3];
__u8 csi;
__u32 rsvd15;
};
struct nvme_directive_cmd {
@ -1283,6 +1401,8 @@ struct nvme_command {
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
struct nvme_write_zeroes_cmd write_zeroes;
struct nvme_zone_mgmt_send_cmd zms;
struct nvme_zone_mgmt_recv_cmd zmr;
struct nvme_abort_cmd abort;
struct nvme_get_log_page_command get_log_page;
struct nvmf_common_command fabrics;
@ -1416,6 +1536,18 @@ enum {
NVME_SC_DISCOVERY_RESTART = 0x190,
NVME_SC_AUTH_REQUIRED = 0x191,
/*
* I/O Command Set Specific - Zoned commands:
*/
NVME_SC_ZONE_BOUNDARY_ERROR = 0x1b8,
NVME_SC_ZONE_FULL = 0x1b9,
NVME_SC_ZONE_READ_ONLY = 0x1ba,
NVME_SC_ZONE_OFFLINE = 0x1bb,
NVME_SC_ZONE_INVALID_WRITE = 0x1bc,
NVME_SC_ZONE_TOO_MANY_ACTIVE = 0x1bd,
NVME_SC_ZONE_TOO_MANY_OPEN = 0x1be,
NVME_SC_ZONE_INVALID_TRANSITION = 0x1bf,
/*
* Media and Data Integrity Errors:
*/

View File

@ -141,11 +141,13 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys)
* Version 3: Cache device with new UUID format
* Version 4: Backing device with data offset
*/
#define BCACHE_SB_VERSION_CDEV 0
#define BCACHE_SB_VERSION_BDEV 1
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
#define BCACHE_SB_MAX_VERSION 4
#define BCACHE_SB_VERSION_CDEV 0
#define BCACHE_SB_VERSION_BDEV 1
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5
#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6
#define BCACHE_SB_MAX_VERSION 6
#define SB_SECTOR 8
#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT)
@ -173,7 +175,12 @@ struct cache_sb_disk {
__le64 flags;
__le64 seq;
__le64 pad[8];
__le64 feature_compat;
__le64 feature_incompat;
__le64 feature_ro_compat;
__le64 pad[5];
union {
struct {
@ -206,10 +213,16 @@ struct cache_sb_disk {
__le16 keys;
};
__le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */
__le16 bucket_size_hi;
};
/*
* This is for in-memory bcache super block.
* NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member
* size, ordering and even whole struct size may be different
* from cache_sb_disk.
*/
struct cache_sb {
__u64 csum;
__u64 offset; /* sector where this sb was written */
__u64 version;
@ -224,7 +237,10 @@ struct cache_sb {
__u64 flags;
__u64 seq;
__u64 pad[8];
__u64 feature_compat;
__u64 feature_incompat;
__u64 feature_ro_compat;
union {
struct {
@ -232,10 +248,9 @@ struct cache_sb {
__u64 nbuckets; /* device size */
__u16 block_size; /* sectors */
__u16 bucket_size; /* sectors */
__u16 nr_in_set;
__u16 nr_this_dev;
__u32 bucket_size; /* sectors */
};
struct {
/* Backing devices */
@ -262,7 +277,8 @@ struct cache_sb {
static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
{
return sb->version == BCACHE_SB_VERSION_BDEV
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES;
}
BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);

View File

@ -73,6 +73,15 @@ enum blk_zone_cond {
BLK_ZONE_COND_OFFLINE = 0xF,
};
/**
* enum blk_zone_report_flags - Feature flags of reported zone descriptors.
*
* @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field.
*/
enum blk_zone_report_flags {
BLK_ZONE_REP_CAPACITY = (1 << 0),
};
/**
* struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl.
*
@ -99,7 +108,9 @@ struct blk_zone {
__u8 cond; /* Zone condition */
__u8 non_seq; /* Non-sequential write resources active */
__u8 reset; /* Reset write pointer recommended */
__u8 reserved[36];
__u8 resv[4];
__u64 capacity; /* Zone capacity in number of sectors */
__u8 reserved[24];
};
/**
@ -115,7 +126,7 @@ struct blk_zone {
struct blk_zone_report {
__u64 sector;
__u32 nr_zones;
__u8 reserved[4];
__u32 flags;
struct blk_zone zones[0];
};

View File

@ -123,7 +123,7 @@ typedef struct mdp_device_descriptor_s {
/*
* Notes:
* - if an array is being reshaped (restriped) in order to change the
* - if an array is being reshaped (restriped) in order to change
* the number of active devices in the array, 'raid_disks' will be
* the larger of the old and new numbers. 'delta_disks' will
* be the "new - old". So if +ve, raid_disks is the new value, and