for-5.9/drivers-20200803
-----BEGIN PGP SIGNATURE-----
iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl8od3oQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgppkpD/9D+XqD9qYcYTj+ShVCc5+3RtMG5ZiAAX0y
l4QXomentn/1Y0UYXFGJH7JLZWrKYT0QiktLtfpe5pmTqRUkckTIyJQlsHb+K6Dz
lFjtywRK9pcFYgiWIUg80wlJKrTa8QdnrlS/Esn4YITKGRbgMIdFvq2jymXC+1ho
RgodlgzcBUREgHSLo0H3cqEKA53fQiJhKC6CbFrFdrkpf2yUpcTfEDtpSwuIuPj3
2AUed1qXUtNjdHciCn3N37OuHqXKAA9noXAWfg9Gx/5zfGUNX9QJvlsny1AopgS0
jJvPSDVAhu/qRLHW6q/ZOT0JAlHegguuTAOtgMh2cMpAS5sumCAtltxVcI7Qnx41
HalMpTefXsVoBo0gfjqldnIPt34ZNj5aH5GYaH/wPpSg6VkTVBJK8GuQDBvg27qT
w+U/T6EzuqniWXh/P3COhfrMCR9ueUOY1qWCRwzomlpeIfBhCzidt2wUqIxX1TOA
Q0Ltf0eERDevsZbE+tIm+VAAg98kHehcS2t8lfFYFO6/PKu2iJpJt/HtJbZNBE+W
rm96E4qXRiy1UuL7D9vBkaWsbnosuNHgGQXx57GlokQU+2IGBmOxV52XHiSxxpXd
AS1ZTd56ItmID8VaU09Pbf7ZFbiCgdEAxIbUFzaCuvo+lxryHFphIUARNi/zPnNT
UC2OzunCqA==
=oADH
-----END PGP SIGNATURE-----

Merge tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block

Pull block driver updates from Jens Axboe:

 - NVMe:
     - ZNS support (Aravind, Keith, Matias, Niklas)
     - Misc cleanups, optimizations, fixes (Baolin, Chaitanya, David,
       Dongli, Max, Sagi)

 - null_blk zone capacity support (Aravind)

 - MD:
     - raid5/6 fixes (ChangSyun)
     - Warning fixes (Damien)
     - raid5 stripe fixes (Guoqing, Song, Yufen)
     - sysfs deadlock fix (Junxiao)
     - raid10 deadlock fix (Vitaly)
     - struct_size conversions (Gustavo)

 - Set of bcache updates/fixes (Coly)

* tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block: (117 commits)
  md/raid5: Allow degraded raid6 to do rmw
  md/raid5: Fix Force reconstruct-write io stuck in degraded raid5
  raid5: don't duplicate code for different paths in handle_stripe
  raid5-cache: hold spinlock instead of mutex in r5c_journal_mode_show
  md: print errno in super_written
  md/raid5: remove the redundant setting of STRIPE_HANDLE
  md: register new md sysfs file 'uuid' read-only
  md: fix max sectors calculation for super 1.0
  nvme-loop: remove extra variable in create ctrl
  nvme-loop: set ctrl state connecting after init
  nvme-multipath: do not fall back to __nvme_find_path() for non-optimized paths
  nvme-multipath: fix logic for non-optimized paths
  nvme-rdma: fix controller reset hang during traffic
  nvme-tcp: fix controller reset hang during traffic
  nvmet: introduce the passthru Kconfig option
  nvmet: introduce the passthru configfs interface
  nvmet: Add passthru enable/disable helpers
  nvmet: add passthru code to process commands
  nvme: export nvme_find_get_ns() and nvme_put_ns()
  nvme: introduce nvme_ctrl_get_by_path()
  ...
commit e0fc99e21e
@ -273,6 +273,24 @@ Description:
		device ("host-aware" or "host-managed" zone model). For regular
		block devices, the value is always 0.

What:		/sys/block/<disk>/queue/max_active_zones
Date:		July 2020
Contact:	Niklas Cassel <niklas.cassel@wdc.com>
Description:
		For zoned block devices (zoned attribute indicating
		"host-managed" or "host-aware"), the sum of zones belonging to
		any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED,
		is limited by this value. If this value is 0, there is no limit.

What:		/sys/block/<disk>/queue/max_open_zones
Date:		July 2020
Contact:	Niklas Cassel <niklas.cassel@wdc.com>
Description:
		For zoned block devices (zoned attribute indicating
		"host-managed" or "host-aware"), the sum of zones belonging to
		any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN,
		is limited by this value. If this value is 0, there is no limit.

What:		/sys/block/<disk>/queue/chunk_sectors
Date:		September 2016
Contact:	Hannes Reinecke <hare@suse.com>
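
A minimal userspace sketch of how the two new attributes can be read; the disk
name "nvme0n1" is a placeholder, and the "0 means no limit" convention is taken
from the ABI text above.

/*
 * Sketch: read max_open_zones / max_active_zones for a disk.
 * The disk name is hypothetical; the attribute paths follow the
 * sysfs ABI entries documented above.
 */
#include <stdio.h>

static long read_queue_limit(const char *disk, const char *attr)
{
	char path[256];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
	f = fopen(path, "r");
	if (!f)
		return -1;		/* attribute absent or unreadable */
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;			/* 0 means "no limit" per the ABI text */
}

int main(void)
{
	printf("max_open_zones:   %ld\n", read_queue_limit("nvme0n1", "max_open_zones"));
	printf("max_active_zones: %ld\n", read_queue_limit("nvme0n1", "max_active_zones"));
	return 0;
}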
@ -426,6 +426,10 @@ All md devices contain:
     The accepted values when writing to this file are ``ppl`` and ``resync``,
     used to enable and disable PPL.

  uuid
     This indicates the UUID of the array in the following format:
     xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx


As component devices are added to an md array, they appear in the ``md``
directory as new directories named::
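
A short sketch of reading the new read-only 'uuid' attribute from userspace.
The path /sys/block/md0/md/uuid is an assumed example for an array named md0;
the attribute itself is the one documented above.

/*
 * Sketch: print the UUID of an md array via the new 'uuid' sysfs file.
 * The md0 path below is an illustrative example, not mandated by the doc.
 */
#include <stdio.h>

int main(void)
{
	char uuid[64] = "";
	FILE *f = fopen("/sys/block/md0/md/uuid", "r");

	if (!f) {
		perror("open uuid");
		return 1;
	}
	if (fgets(uuid, sizeof(uuid), f))
		printf("array uuid: %s", uuid);	/* xxxxxxxx-xxxx-... format */
	fclose(f);
	return 0;
}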
@ -117,6 +117,20 @@ Maximum number of elements in a DMA scatter/gather list with integrity
data that will be submitted by the block layer core to the associated
block driver.

max_active_zones (RO)
---------------------
For zoned block devices (zoned attribute indicating "host-managed" or
"host-aware"), the sum of zones belonging to any of the zone states:
EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value.
If this value is 0, there is no limit.

max_open_zones (RO)
-------------------
For zoned block devices (zoned attribute indicating "host-managed" or
"host-aware"), the sum of zones belonging to any of the zone states:
EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value.
If this value is 0, there is no limit.

max_sectors_kb (RW)
-------------------
This is the maximum number of kilobytes that the block layer will allow
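
These limits matter to applications that stream writes to many sequential
zones at once: staying within max_open_zones avoids write failures on devices
that enforce it. Below is a small, purely illustrative bookkeeping sketch; the
structure and helper are application-side conventions, not a kernel interface.

/*
 * Illustrative only: cap the number of zones written to concurrently so
 * the application never exceeds the queue's max_open_zones value.
 */
#include <stdbool.h>

struct zone_budget {
	long max_open_zones;	/* value read from the sysfs attribute above */
	long open_zone_count;	/* zones this process currently has open */
};

static bool can_open_another_zone(const struct zone_budget *zb)
{
	if (zb->max_open_zones == 0)
		return true;	/* device reports no limit */
	return zb->open_zone_count < zb->max_open_zones;
}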
@ -86,9 +86,10 @@ config BLK_DEV_ZONED
	select MQ_IOSCHED_DEADLINE
	help
	  Block layer zoned block device support. This option enables
	  support for ZAC/ZBC host-managed and host-aware zoned block devices.
	  support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
	  devices.

	  Say yes here if you have a ZAC or ZBC storage device.
	  Say yes here if you have a ZAC, ZBC, or ZNS storage device.

config BLK_DEV_THROTTLING
	bool "Block layer bio throttling support"
@ -306,6 +306,16 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
	return queue_var_show(blk_queue_nr_zones(q), page);
}

static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
{
	return queue_var_show(queue_max_open_zones(q), page);
}

static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page)
{
	return queue_var_show(queue_max_active_zones(q), page);
}

static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
{
	return queue_var_show((blk_queue_nomerges(q) << 1) |
@ -668,6 +678,16 @@ static struct queue_sysfs_entry queue_nr_zones_entry = {
	.show = queue_nr_zones_show,
};

static struct queue_sysfs_entry queue_max_open_zones_entry = {
	.attr = {.name = "max_open_zones", .mode = 0444 },
	.show = queue_max_open_zones_show,
};

static struct queue_sysfs_entry queue_max_active_zones_entry = {
	.attr = {.name = "max_active_zones", .mode = 0444 },
	.show = queue_max_active_zones_show,
};

static struct queue_sysfs_entry queue_nomerges_entry = {
	.attr = {.name = "nomerges", .mode = 0644 },
	.show = queue_nomerges_show,
@ -766,6 +786,8 @@ static struct attribute *queue_attrs[] = {
	&queue_nonrot_entry.attr,
	&queue_zoned_entry.attr,
	&queue_nr_zones_entry.attr,
	&queue_max_open_zones_entry.attr,
	&queue_max_active_zones_entry.attr,
	&queue_nomerges_entry.attr,
	&queue_rq_affinity_entry.attr,
	&queue_iostats_entry.attr,
@ -793,6 +815,11 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
	    (!q->mq_ops || !q->mq_ops->timeout))
		return 0;

	if ((attr == &queue_max_open_zones_entry.attr ||
	     attr == &queue_max_active_zones_entry.attr) &&
	    !blk_queue_is_zoned(q))
		return 0;

	return attr->mode;
}

@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
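
With BLK_ZONE_REP_CAPACITY set by the hunk above, userspace can trust the
per-zone capacity field in the zone report. A sketch of the consumer side,
assuming the 5.9-era <linux/blkzoned.h> UAPI (BLKREPORTZONE ioctl, a flags
field in struct blk_zone_report, a capacity field in struct blk_zone); the
device path is a placeholder and error handling is trimmed.

/*
 * Sketch: issue BLKREPORTZONE and only use zone->capacity when the
 * kernel sets BLK_ZONE_REP_CAPACITY; otherwise fall back to zone->len.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main(int argc, char **argv)
{
	struct blk_zone_report *rep;
	unsigned int nr = 16;
	int fd, i;

	fd = open(argc > 1 ? argv[1] : "/dev/nvme0n1", O_RDONLY);
	if (fd < 0)
		return 1;

	rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
	if (!rep) {
		close(fd);
		return 1;
	}
	rep->sector = 0;
	rep->nr_zones = nr;

	if (ioctl(fd, BLKREPORTZONE, rep) == 0) {
		int has_cap = rep->flags & BLK_ZONE_REP_CAPACITY;

		for (i = 0; i < (int)rep->nr_zones; i++)
			printf("zone %d: len %llu cap %llu\n", i,
			       (unsigned long long)rep->zones[i].len,
			       has_cap ? (unsigned long long)rep->zones[i].capacity
				       : (unsigned long long)rep->zones[i].len);
	}
	free(rep);
	close(fd);
	return 0;
}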
@ -45,6 +45,9 @@ static const guid_t prp_guids[] = {
	/* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */
	GUID_INIT(0x6c501103, 0xc189, 0x4296,
		  0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d),
	/* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */
	GUID_INIT(0x5025030f, 0x842f, 0x4ab4,
		  0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0),
};

/* ACPI _DSD data subnodes GUID: dbb8e3e6-5886-4ba6-8795-1319f52a966b */
@ -49,6 +49,7 @@ struct nullb_device {
|
||||
unsigned long completion_nsec; /* time in ns to complete a request */
|
||||
unsigned long cache_size; /* disk cache size in MB */
|
||||
unsigned long zone_size; /* zone size in MB if device is zoned */
|
||||
unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
|
||||
unsigned int zone_nr_conv; /* number of conventional zones */
|
||||
unsigned int submit_queues; /* number of submission queues */
|
||||
unsigned int home_node; /* home node for the device */
|
||||
|
@ -200,6 +200,10 @@ static unsigned long g_zone_size = 256;
|
||||
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
|
||||
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
|
||||
|
||||
static unsigned long g_zone_capacity;
|
||||
module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
|
||||
MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");
|
||||
|
||||
static unsigned int g_zone_nr_conv;
|
||||
module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
|
||||
MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
|
||||
@ -341,6 +345,7 @@ NULLB_DEVICE_ATTR(mbps, uint, NULL);
|
||||
NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
|
||||
NULLB_DEVICE_ATTR(zoned, bool, NULL);
|
||||
NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
|
||||
NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
|
||||
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
|
||||
|
||||
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
|
||||
@ -457,6 +462,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
|
||||
&nullb_device_attr_badblocks,
|
||||
&nullb_device_attr_zoned,
|
||||
&nullb_device_attr_zone_size,
|
||||
&nullb_device_attr_zone_capacity,
|
||||
&nullb_device_attr_zone_nr_conv,
|
||||
NULL,
|
||||
};
|
||||
@ -510,7 +516,8 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
|
||||
|
||||
static ssize_t memb_group_features_show(struct config_item *item, char *page)
|
||||
{
|
||||
return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_nr_conv\n");
|
||||
return snprintf(page, PAGE_SIZE,
|
||||
"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
|
||||
}
|
||||
|
||||
CONFIGFS_ATTR_RO(memb_group_, features);
|
||||
@ -571,6 +578,7 @@ static struct nullb_device *null_alloc_dev(void)
|
||||
dev->use_per_node_hctx = g_use_per_node_hctx;
|
||||
dev->zoned = g_zoned;
|
||||
dev->zone_size = g_zone_size;
|
||||
dev->zone_capacity = g_zone_capacity;
|
||||
dev->zone_nr_conv = g_zone_nr_conv;
|
||||
return dev;
|
||||
}
|
||||
|
@ -28,6 +28,15 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!dev->zone_capacity)
|
||||
dev->zone_capacity = dev->zone_size;
|
||||
|
||||
if (dev->zone_capacity > dev->zone_size) {
|
||||
pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
|
||||
dev->zone_capacity, dev->zone_size);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
|
||||
dev->nr_zones = dev_size >>
|
||||
(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
|
||||
@ -47,6 +56,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
|
||||
|
||||
zone->start = sector;
|
||||
zone->len = dev->zone_size_sects;
|
||||
zone->capacity = zone->len;
|
||||
zone->wp = zone->start + zone->len;
|
||||
zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
|
||||
zone->cond = BLK_ZONE_COND_NOT_WP;
|
||||
@ -59,6 +69,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
|
||||
|
||||
zone->start = zone->wp = sector;
|
||||
zone->len = dev->zone_size_sects;
|
||||
zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT;
|
||||
zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
|
||||
zone->cond = BLK_ZONE_COND_EMPTY;
|
||||
|
||||
@ -185,6 +196,9 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
|
||||
return BLK_STS_IOERR;
|
||||
}
|
||||
|
||||
if (zone->wp + nr_sectors > zone->start + zone->capacity)
|
||||
return BLK_STS_IOERR;
|
||||
|
||||
if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
|
||||
zone->cond = BLK_ZONE_COND_IMP_OPEN;
|
||||
|
||||
@ -193,7 +207,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
|
||||
return ret;
|
||||
|
||||
zone->wp += nr_sectors;
|
||||
if (zone->wp == zone->start + zone->len)
|
||||
if (zone->wp == zone->start + zone->capacity)
|
||||
zone->cond = BLK_ZONE_COND_FULL;
|
||||
return BLK_STS_OK;
|
||||
default:
|
||||
|
@ -562,13 +562,15 @@ static int rsxx_eeh_frozen(struct pci_dev *dev)
|
||||
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
if (card->ctrl[i].status.buf)
|
||||
pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
|
||||
card->ctrl[i].status.buf,
|
||||
card->ctrl[i].status.dma_addr);
|
||||
dma_free_coherent(&card->dev->dev,
|
||||
STATUS_BUFFER_SIZE8,
|
||||
card->ctrl[i].status.buf,
|
||||
card->ctrl[i].status.dma_addr);
|
||||
if (card->ctrl[i].cmd.buf)
|
||||
pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
|
||||
card->ctrl[i].cmd.buf,
|
||||
card->ctrl[i].cmd.dma_addr);
|
||||
dma_free_coherent(&card->dev->dev,
|
||||
COMMAND_BUFFER_SIZE8,
|
||||
card->ctrl[i].cmd.buf,
|
||||
card->ctrl[i].cmd.dma_addr);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -711,15 +713,15 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
|
||||
failed_hw_buffers_init:
|
||||
for (i = 0; i < card->n_targets; i++) {
|
||||
if (card->ctrl[i].status.buf)
|
||||
pci_free_consistent(card->dev,
|
||||
STATUS_BUFFER_SIZE8,
|
||||
card->ctrl[i].status.buf,
|
||||
card->ctrl[i].status.dma_addr);
|
||||
dma_free_coherent(&card->dev->dev,
|
||||
STATUS_BUFFER_SIZE8,
|
||||
card->ctrl[i].status.buf,
|
||||
card->ctrl[i].status.dma_addr);
|
||||
if (card->ctrl[i].cmd.buf)
|
||||
pci_free_consistent(card->dev,
|
||||
COMMAND_BUFFER_SIZE8,
|
||||
card->ctrl[i].cmd.buf,
|
||||
card->ctrl[i].cmd.dma_addr);
|
||||
dma_free_coherent(&card->dev->dev,
|
||||
COMMAND_BUFFER_SIZE8,
|
||||
card->ctrl[i].cmd.buf,
|
||||
card->ctrl[i].cmd.dma_addr);
|
||||
}
|
||||
failed_hw_setup:
|
||||
rsxx_eeh_failure(dev);
|
||||
|
@ -27,7 +27,7 @@ config BCACHE_CLOSURES_DEBUG
|
||||
interface to list them, which makes it possible to see asynchronous
|
||||
operations that get stuck.
|
||||
|
||||
config BCACHE_ASYNC_REGISTRAION
|
||||
config BCACHE_ASYNC_REGISTRATION
|
||||
bool "Asynchronous device registration (EXPERIMENTAL)"
|
||||
depends on BCACHE
|
||||
help
|
||||
|
@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE) += bcache.o
|
||||
|
||||
bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
|
||||
io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
|
||||
util.o writeback.o
|
||||
util.o writeback.o features.o
|
||||
|
@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
|
||||
{
|
||||
struct cache *ca;
|
||||
struct bucket *b;
|
||||
unsigned int next = c->nbuckets * c->sb.bucket_size / 1024;
|
||||
unsigned long next = c->nbuckets * c->sb.bucket_size / 1024;
|
||||
unsigned int i;
|
||||
int r;
|
||||
|
||||
|
@ -264,7 +264,7 @@ struct bcache_device {
|
||||
#define BCACHE_DEV_UNLINK_DONE 2
|
||||
#define BCACHE_DEV_WB_RUNNING 3
|
||||
#define BCACHE_DEV_RATE_DW_RUNNING 4
|
||||
unsigned int nr_stripes;
|
||||
int nr_stripes;
|
||||
unsigned int stripe_size;
|
||||
atomic_t *stripe_sectors_dirty;
|
||||
unsigned long *full_dirty_stripes;
|
||||
@ -762,11 +762,32 @@ struct bbio {
|
||||
#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
|
||||
#define block_bytes(c) ((c)->sb.block_size << 9)
|
||||
|
||||
#define prios_per_bucket(c) \
|
||||
((bucket_bytes(c) - sizeof(struct prio_set)) / \
|
||||
static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
|
||||
{
|
||||
unsigned int n, max_pages;
|
||||
|
||||
max_pages = min_t(unsigned int,
|
||||
__rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS,
|
||||
MAX_ORDER_NR_PAGES);
|
||||
|
||||
n = sb->bucket_size / PAGE_SECTORS;
|
||||
if (n > max_pages)
|
||||
n = max_pages;
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
static inline unsigned int meta_bucket_bytes(struct cache_sb *sb)
|
||||
{
|
||||
return meta_bucket_pages(sb) << PAGE_SHIFT;
|
||||
}
|
||||
|
||||
#define prios_per_bucket(ca) \
|
||||
((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \
|
||||
sizeof(struct bucket_disk))
|
||||
#define prio_buckets(c) \
|
||||
DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c))
|
||||
|
||||
#define prio_buckets(ca) \
|
||||
DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca))
|
||||
|
||||
static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
|
||||
{
|
||||
|
@ -322,7 +322,7 @@ int bch_btree_keys_alloc(struct btree_keys *b,
|
||||
|
||||
b->page_order = page_order;
|
||||
|
||||
t->data = (void *) __get_free_pages(gfp, b->page_order);
|
||||
t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order);
|
||||
if (!t->data)
|
||||
goto err;
|
||||
|
||||
|
@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c)
|
||||
if (c->verify_data)
|
||||
list_move(&c->verify_data->list, &c->btree_cache);
|
||||
|
||||
free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
|
||||
free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb)));
|
||||
#endif
|
||||
|
||||
list_splice(&c->btree_cache_freeable,
|
||||
@ -785,7 +785,15 @@ int bch_btree_cache_alloc(struct cache_set *c)
|
||||
mutex_init(&c->verify_lock);
|
||||
|
||||
c->verify_ondisk = (void *)
|
||||
__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
|
||||
__get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb)));
|
||||
if (!c->verify_ondisk) {
|
||||
/*
|
||||
* Don't worry about the mca_rereserve buckets
|
||||
* allocated in previous for-loop, they will be
|
||||
* handled properly in bch_cache_set_unregister().
|
||||
*/
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
|
||||
|
||||
|
drivers/md/bcache/features.c | 75 (new file)
@ -0,0 +1,75 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Feature set bits and string conversion.
|
||||
* Inspired by ext4's features compat/incompat/ro_compat related code.
|
||||
*
|
||||
* Copyright 2020 Coly Li <colyli@suse.de>
|
||||
*
|
||||
*/
|
||||
#include <linux/bcache.h>
|
||||
#include "bcache.h"
|
||||
#include "features.h"
|
||||
|
||||
struct feature {
|
||||
int compat;
|
||||
unsigned int mask;
|
||||
const char *string;
|
||||
};
|
||||
|
||||
static struct feature feature_list[] = {
|
||||
{BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET,
|
||||
"large_bucket"},
|
||||
{0, 0, 0 },
|
||||
};
|
||||
|
||||
#define compose_feature_string(type) \
|
||||
({ \
|
||||
struct feature *f; \
|
||||
bool first = true; \
|
||||
\
|
||||
for (f = &feature_list[0]; f->compat != 0; f++) { \
|
||||
if (f->compat != BCH_FEATURE_ ## type) \
|
||||
continue; \
|
||||
if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) { \
|
||||
if (first) { \
|
||||
out += snprintf(out, buf + size - out, \
|
||||
"["); \
|
||||
} else { \
|
||||
out += snprintf(out, buf + size - out, \
|
||||
" ["); \
|
||||
} \
|
||||
} else if (!first) { \
|
||||
out += snprintf(out, buf + size - out, " "); \
|
||||
} \
|
||||
\
|
||||
out += snprintf(out, buf + size - out, "%s", f->string);\
|
||||
\
|
||||
if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) \
|
||||
out += snprintf(out, buf + size - out, "]"); \
|
||||
\
|
||||
first = false; \
|
||||
} \
|
||||
if (!first) \
|
||||
out += snprintf(out, buf + size - out, "\n"); \
|
||||
})
|
||||
|
||||
int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size)
|
||||
{
|
||||
char *out = buf;
|
||||
compose_feature_string(COMPAT);
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size)
|
||||
{
|
||||
char *out = buf;
|
||||
compose_feature_string(RO_COMPAT);
|
||||
return out - buf;
|
||||
}
|
||||
|
||||
int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size)
|
||||
{
|
||||
char *out = buf;
|
||||
compose_feature_string(INCOMPAT);
|
||||
return out - buf;
|
||||
}
|
drivers/md/bcache/features.h | 86 (new file)
@ -0,0 +1,86 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
|
||||
#ifndef _BCACHE_FEATURES_H
|
||||
#define _BCACHE_FEATURES_H
|
||||
|
||||
#include <linux/bcache.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#define BCH_FEATURE_COMPAT 0
|
||||
#define BCH_FEATURE_RO_COMPAT 1
|
||||
#define BCH_FEATURE_INCOMPAT 2
|
||||
#define BCH_FEATURE_TYPE_MASK 0x03
|
||||
|
||||
/* Feature set definition */
|
||||
/* Incompat feature set */
|
||||
#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */
|
||||
|
||||
#define BCH_FEATURE_COMPAT_SUUP 0
|
||||
#define BCH_FEATURE_RO_COMPAT_SUUP 0
|
||||
#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET
|
||||
|
||||
#define BCH_HAS_COMPAT_FEATURE(sb, mask) \
|
||||
((sb)->feature_compat & (mask))
|
||||
#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \
|
||||
((sb)->feature_ro_compat & (mask))
|
||||
#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \
|
||||
((sb)->feature_incompat & (mask))
|
||||
|
||||
#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \
|
||||
static inline int bch_has_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
return (((sb)->feature_compat & \
|
||||
BCH##_FEATURE_COMPAT_##flagname) != 0); \
|
||||
} \
|
||||
static inline void bch_set_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
(sb)->feature_compat |= \
|
||||
BCH##_FEATURE_COMPAT_##flagname; \
|
||||
} \
|
||||
static inline void bch_clear_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
(sb)->feature_compat &= \
|
||||
~BCH##_FEATURE_COMPAT_##flagname; \
|
||||
}
|
||||
|
||||
#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
|
||||
static inline int bch_has_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
return (((sb)->feature_ro_compat & \
|
||||
BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \
|
||||
} \
|
||||
static inline void bch_set_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
(sb)->feature_ro_compat |= \
|
||||
BCH##_FEATURE_RO_COMPAT_##flagname; \
|
||||
} \
|
||||
static inline void bch_clear_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
(sb)->feature_ro_compat &= \
|
||||
~BCH##_FEATURE_RO_COMPAT_##flagname; \
|
||||
}
|
||||
|
||||
#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \
|
||||
static inline int bch_has_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
return (((sb)->feature_incompat & \
|
||||
BCH##_FEATURE_INCOMPAT_##flagname) != 0); \
|
||||
} \
|
||||
static inline void bch_set_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
(sb)->feature_incompat |= \
|
||||
BCH##_FEATURE_INCOMPAT_##flagname; \
|
||||
} \
|
||||
static inline void bch_clear_feature_##name(struct cache_sb *sb) \
|
||||
{ \
|
||||
(sb)->feature_incompat &= \
|
||||
~BCH##_FEATURE_INCOMPAT_##flagname; \
|
||||
}
|
||||
|
||||
BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET);
|
||||
|
||||
int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size);
|
||||
int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size);
|
||||
int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size);
|
||||
|
||||
#endif
|
@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c)
|
||||
struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO);
|
||||
struct bio *bio = &b->bio;
|
||||
|
||||
bio_init(bio, bio->bi_inline_vecs, bucket_pages(c));
|
||||
bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb));
|
||||
|
||||
return bio;
|
||||
}
|
||||
|
@ -217,10 +217,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
|
||||
*/
|
||||
pr_debug("falling back to linear search\n");
|
||||
|
||||
for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
|
||||
l < ca->sb.njournal_buckets;
|
||||
l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets,
|
||||
l + 1))
|
||||
for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets)
|
||||
if (read_bucket(l))
|
||||
goto bsearch;
|
||||
|
||||
@ -999,8 +996,8 @@ int bch_journal_alloc(struct cache_set *c)
|
||||
j->w[1].c = c;
|
||||
|
||||
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
|
||||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
|
||||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
|
||||
!(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) ||
|
||||
!(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)))
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
|
@ -145,8 +145,8 @@ static void read_moving(struct cache_set *c)
|
||||
continue;
|
||||
}
|
||||
|
||||
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
|
||||
* DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
|
||||
io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs,
|
||||
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
|
||||
GFP_KERNEL);
|
||||
if (!io)
|
||||
goto err;
|
||||
@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c)
|
||||
mutex_lock(&c->bucket_lock);
|
||||
|
||||
for_each_cache(ca, c, i) {
|
||||
unsigned int sectors_to_move = 0;
|
||||
unsigned int reserve_sectors = ca->sb.bucket_size *
|
||||
unsigned long sectors_to_move = 0;
|
||||
unsigned long reserve_sectors = ca->sb.bucket_size *
|
||||
fifo_used(&ca->free[RESERVE_MOVINGGC]);
|
||||
|
||||
ca->heap.used = 0;
|
||||
|
@ -668,7 +668,9 @@ static void backing_request_endio(struct bio *bio)
|
||||
static void bio_complete(struct search *s)
|
||||
{
|
||||
if (s->orig_bio) {
|
||||
bio_end_io_acct(s->orig_bio, s->start_time);
|
||||
/* Count on bcache device */
|
||||
disk_end_io_acct(s->d->disk, bio_op(s->orig_bio), s->start_time);
|
||||
|
||||
trace_bcache_request_end(s->d, s->orig_bio);
|
||||
s->orig_bio->bi_status = s->iop.status;
|
||||
bio_endio(s->orig_bio);
|
||||
@ -728,8 +730,8 @@ static inline struct search *search_alloc(struct bio *bio,
|
||||
s->recoverable = 1;
|
||||
s->write = op_is_write(bio_op(bio));
|
||||
s->read_dirty_data = 0;
|
||||
s->start_time = bio_start_io_acct(bio);
|
||||
|
||||
/* Count on the bcache device */
|
||||
s->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
|
||||
s->iop.c = d->c;
|
||||
s->iop.bio = NULL;
|
||||
s->iop.inode = d->id;
|
||||
@ -1080,7 +1082,8 @@ static void detached_dev_end_io(struct bio *bio)
|
||||
bio->bi_end_io = ddip->bi_end_io;
|
||||
bio->bi_private = ddip->bi_private;
|
||||
|
||||
bio_end_io_acct(bio, ddip->start_time);
|
||||
/* Count on the bcache device */
|
||||
disk_end_io_acct(ddip->d->disk, bio_op(bio), ddip->start_time);
|
||||
|
||||
if (bio->bi_status) {
|
||||
struct cached_dev *dc = container_of(ddip->d,
|
||||
@ -1105,7 +1108,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
|
||||
*/
|
||||
ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
|
||||
ddip->d = d;
|
||||
ddip->start_time = bio_start_io_acct(bio);
|
||||
/* Count on the bcache device */
|
||||
ddip->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio));
|
||||
ddip->bi_end_io = bio->bi_end_io;
|
||||
ddip->bi_private = bio->bi_private;
|
||||
bio->bi_end_io = detached_dev_end_io;
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "extents.h"
|
||||
#include "request.h"
|
||||
#include "writeback.h"
|
||||
#include "features.h"
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/debugfs.h>
|
||||
@ -59,6 +60,92 @@ struct workqueue_struct *bch_journal_wq;
|
||||
|
||||
/* Superblock */
|
||||
|
||||
static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
|
||||
{
|
||||
unsigned int bucket_size = le16_to_cpu(s->bucket_size);
|
||||
|
||||
if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES &&
|
||||
bch_has_feature_large_bucket(sb))
|
||||
bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16;
|
||||
|
||||
return bucket_size;
|
||||
}
|
||||
|
||||
static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
|
||||
struct cache_sb_disk *s)
|
||||
{
|
||||
const char *err;
|
||||
unsigned int i;
|
||||
|
||||
sb->first_bucket= le16_to_cpu(s->first_bucket);
|
||||
sb->nbuckets = le64_to_cpu(s->nbuckets);
|
||||
sb->bucket_size = get_bucket_size(sb, s);
|
||||
|
||||
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
|
||||
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
|
||||
|
||||
err = "Too many journal buckets";
|
||||
if (sb->keys > SB_JOURNAL_BUCKETS)
|
||||
goto err;
|
||||
|
||||
err = "Too many buckets";
|
||||
if (sb->nbuckets > LONG_MAX)
|
||||
goto err;
|
||||
|
||||
err = "Not enough buckets";
|
||||
if (sb->nbuckets < 1 << 7)
|
||||
goto err;
|
||||
|
||||
err = "Bad block size (not power of 2)";
|
||||
if (!is_power_of_2(sb->block_size))
|
||||
goto err;
|
||||
|
||||
err = "Bad block size (larger than page size)";
|
||||
if (sb->block_size > PAGE_SECTORS)
|
||||
goto err;
|
||||
|
||||
err = "Bad bucket size (not power of 2)";
|
||||
if (!is_power_of_2(sb->bucket_size))
|
||||
goto err;
|
||||
|
||||
err = "Bad bucket size (smaller than page size)";
|
||||
if (sb->bucket_size < PAGE_SECTORS)
|
||||
goto err;
|
||||
|
||||
err = "Invalid superblock: device too small";
|
||||
if (get_capacity(bdev->bd_disk) <
|
||||
sb->bucket_size * sb->nbuckets)
|
||||
goto err;
|
||||
|
||||
err = "Bad UUID";
|
||||
if (bch_is_zero(sb->set_uuid, 16))
|
||||
goto err;
|
||||
|
||||
err = "Bad cache device number in set";
|
||||
if (!sb->nr_in_set ||
|
||||
sb->nr_in_set <= sb->nr_this_dev ||
|
||||
sb->nr_in_set > MAX_CACHES_PER_SET)
|
||||
goto err;
|
||||
|
||||
err = "Journal buckets not sequential";
|
||||
for (i = 0; i < sb->keys; i++)
|
||||
if (sb->d[i] != sb->first_bucket + i)
|
||||
goto err;
|
||||
|
||||
err = "Too many journal buckets";
|
||||
if (sb->first_bucket + sb->keys > sb->nbuckets)
|
||||
goto err;
|
||||
|
||||
err = "Invalid superblock: first bucket comes before end of super";
|
||||
if (sb->first_bucket * sb->bucket_size < 16)
|
||||
goto err;
|
||||
|
||||
err = NULL;
|
||||
err:
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
|
||||
struct cache_sb_disk **res)
|
||||
{
|
||||
@ -84,7 +171,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
|
||||
sb->flags = le64_to_cpu(s->flags);
|
||||
sb->seq = le64_to_cpu(s->seq);
|
||||
sb->last_mount = le32_to_cpu(s->last_mount);
|
||||
sb->first_bucket = le16_to_cpu(s->first_bucket);
|
||||
sb->keys = le16_to_cpu(s->keys);
|
||||
|
||||
for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
|
||||
@ -101,10 +187,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
|
||||
if (memcmp(sb->magic, bcache_magic, 16))
|
||||
goto err;
|
||||
|
||||
err = "Too many journal buckets";
|
||||
if (sb->keys > SB_JOURNAL_BUCKETS)
|
||||
goto err;
|
||||
|
||||
err = "Bad checksum";
|
||||
if (s->csum != csum_set(s))
|
||||
goto err;
|
||||
@ -124,6 +206,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
|
||||
sb->data_offset = BDEV_DATA_START_DEFAULT;
|
||||
break;
|
||||
case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
|
||||
case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
|
||||
sb->data_offset = le64_to_cpu(s->data_offset);
|
||||
|
||||
err = "Bad data offset";
|
||||
@ -133,55 +216,21 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
|
||||
break;
|
||||
case BCACHE_SB_VERSION_CDEV:
|
||||
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
|
||||
sb->nbuckets = le64_to_cpu(s->nbuckets);
|
||||
sb->bucket_size = le16_to_cpu(s->bucket_size);
|
||||
|
||||
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
|
||||
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
|
||||
|
||||
err = "Too many buckets";
|
||||
if (sb->nbuckets > LONG_MAX)
|
||||
err = read_super_common(sb, bdev, s);
|
||||
if (err)
|
||||
goto err;
|
||||
|
||||
err = "Not enough buckets";
|
||||
if (sb->nbuckets < 1 << 7)
|
||||
break;
|
||||
case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
|
||||
/*
|
||||
* Feature bits are needed in read_super_common(),
|
||||
* convert them first.
|
||||
*/
|
||||
sb->feature_compat = le64_to_cpu(s->feature_compat);
|
||||
sb->feature_incompat = le64_to_cpu(s->feature_incompat);
|
||||
sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
|
||||
err = read_super_common(sb, bdev, s);
|
||||
if (err)
|
||||
goto err;
|
||||
|
||||
err = "Bad block/bucket size";
|
||||
if (!is_power_of_2(sb->block_size) ||
|
||||
sb->block_size > PAGE_SECTORS ||
|
||||
!is_power_of_2(sb->bucket_size) ||
|
||||
sb->bucket_size < PAGE_SECTORS)
|
||||
goto err;
|
||||
|
||||
err = "Invalid superblock: device too small";
|
||||
if (get_capacity(bdev->bd_disk) <
|
||||
sb->bucket_size * sb->nbuckets)
|
||||
goto err;
|
||||
|
||||
err = "Bad UUID";
|
||||
if (bch_is_zero(sb->set_uuid, 16))
|
||||
goto err;
|
||||
|
||||
err = "Bad cache device number in set";
|
||||
if (!sb->nr_in_set ||
|
||||
sb->nr_in_set <= sb->nr_this_dev ||
|
||||
sb->nr_in_set > MAX_CACHES_PER_SET)
|
||||
goto err;
|
||||
|
||||
err = "Journal buckets not sequential";
|
||||
for (i = 0; i < sb->keys; i++)
|
||||
if (sb->d[i] != sb->first_bucket + i)
|
||||
goto err;
|
||||
|
||||
err = "Too many journal buckets";
|
||||
if (sb->first_bucket + sb->keys > sb->nbuckets)
|
||||
goto err;
|
||||
|
||||
err = "Invalid superblock: first bucket comes before end of super";
|
||||
if (sb->first_bucket * sb->bucket_size < 16)
|
||||
goto err;
|
||||
|
||||
break;
|
||||
default:
|
||||
err = "Unsupported superblock version";
|
||||
@ -217,7 +266,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
|
||||
offset_in_page(out));
|
||||
|
||||
out->offset = cpu_to_le64(sb->offset);
|
||||
out->version = cpu_to_le64(sb->version);
|
||||
|
||||
memcpy(out->uuid, sb->uuid, 16);
|
||||
memcpy(out->set_uuid, sb->set_uuid, 16);
|
||||
@ -233,6 +281,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
|
||||
for (i = 0; i < sb->keys; i++)
|
||||
out->d[i] = cpu_to_le64(sb->d[i]);
|
||||
|
||||
if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
|
||||
out->feature_compat = cpu_to_le64(sb->feature_compat);
|
||||
out->feature_incompat = cpu_to_le64(sb->feature_incompat);
|
||||
out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
|
||||
}
|
||||
|
||||
out->version = cpu_to_le64(sb->version);
|
||||
out->csum = csum_set(out);
|
||||
|
||||
pr_debug("ver %llu, flags %llu, seq %llu\n",
|
||||
@ -289,17 +344,20 @@ void bcache_write_super(struct cache_set *c)
|
||||
{
|
||||
struct closure *cl = &c->sb_write;
|
||||
struct cache *ca;
|
||||
unsigned int i;
|
||||
unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
|
||||
|
||||
down(&c->sb_write_mutex);
|
||||
closure_init(cl, &c->cl);
|
||||
|
||||
c->sb.seq++;
|
||||
|
||||
if (c->sb.version > version)
|
||||
version = c->sb.version;
|
||||
|
||||
for_each_cache(ca, c, i) {
|
||||
struct bio *bio = &ca->sb_bio;
|
||||
|
||||
ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
|
||||
ca->sb.version = version;
|
||||
ca->sb.seq = c->sb.seq;
|
||||
ca->sb.last_mount = c->sb.last_mount;
|
||||
|
||||
@ -423,6 +481,7 @@ static int __uuid_write(struct cache_set *c)
|
||||
BKEY_PADDED(key) k;
|
||||
struct closure cl;
|
||||
struct cache *ca;
|
||||
unsigned int size;
|
||||
|
||||
closure_init_stack(&cl);
|
||||
lockdep_assert_held(&bch_register_lock);
|
||||
@ -430,7 +489,8 @@ static int __uuid_write(struct cache_set *c)
|
||||
if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
|
||||
return 1;
|
||||
|
||||
SET_KEY_SIZE(&k.key, c->sb.bucket_size);
|
||||
size = meta_bucket_pages(&c->sb) * PAGE_SECTORS;
|
||||
SET_KEY_SIZE(&k.key, size);
|
||||
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
|
||||
closure_sync(&cl);
|
||||
|
||||
@ -518,7 +578,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op,
|
||||
|
||||
bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
|
||||
bio_set_dev(bio, ca->bdev);
|
||||
bio->bi_iter.bi_size = bucket_bytes(ca);
|
||||
bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
|
||||
|
||||
bio->bi_end_io = prio_endio;
|
||||
bio->bi_private = ca;
|
||||
@ -576,7 +636,7 @@ int bch_prio_write(struct cache *ca, bool wait)
|
||||
|
||||
p->next_bucket = ca->prio_buckets[i + 1];
|
||||
p->magic = pset_magic(&ca->sb);
|
||||
p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
|
||||
p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
|
||||
|
||||
bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
|
||||
BUG_ON(bucket == -1);
|
||||
@ -629,7 +689,7 @@ static int prio_read(struct cache *ca, uint64_t bucket)
|
||||
prio_io(ca, bucket, REQ_OP_READ, 0);
|
||||
|
||||
if (p->csum !=
|
||||
bch_crc64(&p->magic, bucket_bytes(ca) - 8)) {
|
||||
bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
|
||||
pr_warn("bad csum reading priorities\n");
|
||||
goto out;
|
||||
}
|
||||
@ -835,19 +895,19 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
|
||||
struct request_queue *q;
|
||||
const size_t max_stripes = min_t(size_t, INT_MAX,
|
||||
SIZE_MAX / sizeof(atomic_t));
|
||||
size_t n;
|
||||
uint64_t n;
|
||||
int idx;
|
||||
|
||||
if (!d->stripe_size)
|
||||
d->stripe_size = 1 << 31;
|
||||
|
||||
d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
|
||||
|
||||
if (!d->nr_stripes || d->nr_stripes > max_stripes) {
|
||||
pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)\n",
|
||||
(unsigned int)d->nr_stripes);
|
||||
n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
|
||||
if (!n || n > max_stripes) {
|
||||
pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
|
||||
n);
|
||||
return -ENOMEM;
|
||||
}
|
||||
d->nr_stripes = n;
|
||||
|
||||
n = d->nr_stripes * sizeof(atomic_t);
|
||||
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
|
||||
@ -1620,7 +1680,7 @@ static void cache_set_free(struct closure *cl)
|
||||
}
|
||||
|
||||
bch_bset_sort_state_free(&c->sort);
|
||||
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
|
||||
free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb)));
|
||||
|
||||
if (c->moving_gc_wq)
|
||||
destroy_workqueue(c->moving_gc_wq);
|
||||
@ -1783,7 +1843,10 @@ void bch_cache_set_unregister(struct cache_set *c)
|
||||
}
|
||||
|
||||
#define alloc_bucket_pages(gfp, c) \
|
||||
((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
|
||||
((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c))))
|
||||
|
||||
#define alloc_meta_bucket_pages(gfp, sb) \
|
||||
((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
|
||||
|
||||
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
||||
{
|
||||
@ -1814,12 +1877,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
||||
c->sb.bucket_size = sb->bucket_size;
|
||||
c->sb.nr_in_set = sb->nr_in_set;
|
||||
c->sb.last_mount = sb->last_mount;
|
||||
c->sb.version = sb->version;
|
||||
if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
|
||||
c->sb.feature_compat = sb->feature_compat;
|
||||
c->sb.feature_ro_compat = sb->feature_ro_compat;
|
||||
c->sb.feature_incompat = sb->feature_incompat;
|
||||
}
|
||||
|
||||
c->bucket_bits = ilog2(sb->bucket_size);
|
||||
c->block_bits = ilog2(sb->block_size);
|
||||
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
|
||||
c->nr_uuids = meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry);
|
||||
c->devices_max_used = 0;
|
||||
atomic_set(&c->attached_dev_nr, 0);
|
||||
c->btree_pages = bucket_pages(c);
|
||||
c->btree_pages = meta_bucket_pages(&c->sb);
|
||||
if (c->btree_pages > BTREE_MAX_PAGES)
|
||||
c->btree_pages = max_t(int, c->btree_pages / 4,
|
||||
BTREE_MAX_PAGES);
|
||||
@ -1845,24 +1915,46 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
||||
INIT_LIST_HEAD(&c->btree_cache_freed);
|
||||
INIT_LIST_HEAD(&c->data_buckets);
|
||||
|
||||
iter_size = (sb->bucket_size / sb->block_size + 1) *
|
||||
iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
|
||||
sizeof(struct btree_iter_set);
|
||||
|
||||
if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
|
||||
mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
|
||||
mempool_init_kmalloc_pool(&c->bio_meta, 2,
|
||||
sizeof(struct bbio) + sizeof(struct bio_vec) *
|
||||
bucket_pages(c)) ||
|
||||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
|
||||
bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
|
||||
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
|
||||
!(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
|
||||
!(c->moving_gc_wq = alloc_workqueue("bcache_gc",
|
||||
WQ_MEM_RECLAIM, 0)) ||
|
||||
bch_journal_alloc(c) ||
|
||||
bch_btree_cache_alloc(c) ||
|
||||
bch_open_buckets_alloc(c) ||
|
||||
bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
|
||||
c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
|
||||
if (!c->devices)
|
||||
goto err;
|
||||
|
||||
if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
|
||||
goto err;
|
||||
|
||||
if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
|
||||
sizeof(struct bbio) +
|
||||
sizeof(struct bio_vec) * meta_bucket_pages(&c->sb)))
|
||||
goto err;
|
||||
|
||||
if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
|
||||
goto err;
|
||||
|
||||
if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
|
||||
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
|
||||
goto err;
|
||||
|
||||
c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb);
|
||||
if (!c->uuids)
|
||||
goto err;
|
||||
|
||||
c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
|
||||
if (!c->moving_gc_wq)
|
||||
goto err;
|
||||
|
||||
if (bch_journal_alloc(c))
|
||||
goto err;
|
||||
|
||||
if (bch_btree_cache_alloc(c))
|
||||
goto err;
|
||||
|
||||
if (bch_open_buckets_alloc(c))
|
||||
goto err;
|
||||
|
||||
if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
|
||||
goto err;
|
||||
|
||||
c->congested_read_threshold_us = 2000;
|
||||
@ -2107,7 +2199,14 @@ found:
|
||||
sysfs_create_link(&c->kobj, &ca->kobj, buf))
|
||||
goto err;
|
||||
|
||||
if (ca->sb.seq > c->sb.seq) {
|
||||
/*
|
||||
* A special case is both ca->sb.seq and c->sb.seq are 0,
|
||||
* such a condition happens on a newly created cache device whose
|
||||
* super block is never flushed yet. In this case c->sb.version
|
||||
* and other members should be updated too, otherwise we will
|
||||
* have a mistaken super block version in cache set.
|
||||
*/
|
||||
if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) {
|
||||
c->sb.version = ca->sb.version;
|
||||
memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
|
||||
c->sb.flags = ca->sb.flags;
|
||||
@ -2145,7 +2244,7 @@ void bch_cache_release(struct kobject *kobj)
|
||||
ca->set->cache[ca->sb.nr_this_dev] = NULL;
|
||||
}
|
||||
|
||||
free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
|
||||
free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
|
||||
kfree(ca->prio_buckets);
|
||||
vfree(ca->buckets);
|
||||
|
||||
@ -2242,7 +2341,7 @@ static int cache_alloc(struct cache *ca)
|
||||
goto err_prio_buckets_alloc;
|
||||
}
|
||||
|
||||
ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
|
||||
ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
|
||||
if (!ca->disk_buckets) {
|
||||
err = "ca->disk_buckets alloc failed";
|
||||
goto err_disk_buckets_alloc;
|
||||
@ -2789,7 +2888,7 @@ static int __init bcache_init(void)
|
||||
static const struct attribute *files[] = {
|
||||
&ksysfs_register.attr,
|
||||
&ksysfs_register_quiet.attr,
|
||||
#ifdef CONFIG_BCACHE_ASYNC_REGISTRAION
|
||||
#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
|
||||
&ksysfs_register_async.attr,
|
||||
#endif
|
||||
&ksysfs_pendings_cleanup.attr,
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "btree.h"
|
||||
#include "request.h"
|
||||
#include "writeback.h"
|
||||
#include "features.h"
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/sort.h>
|
||||
@ -88,6 +89,9 @@ read_attribute(btree_used_percent);
|
||||
read_attribute(average_key_size);
|
||||
read_attribute(dirty_data);
|
||||
read_attribute(bset_tree_stats);
|
||||
read_attribute(feature_compat);
|
||||
read_attribute(feature_ro_compat);
|
||||
read_attribute(feature_incompat);
|
||||
|
||||
read_attribute(state);
|
||||
read_attribute(cache_read_races);
|
||||
@ -779,6 +783,13 @@ SHOW(__bch_cache_set)
|
||||
if (attr == &sysfs_bset_tree_stats)
|
||||
return bch_bset_print_stats(c, buf);
|
||||
|
||||
if (attr == &sysfs_feature_compat)
|
||||
return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE);
|
||||
if (attr == &sysfs_feature_ro_compat)
|
||||
return bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE);
|
||||
if (attr == &sysfs_feature_incompat)
|
||||
return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
SHOW_LOCKED(bch_cache_set)
|
||||
@ -987,6 +998,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
|
||||
&sysfs_io_disable,
|
||||
&sysfs_cutoff_writeback,
|
||||
&sysfs_cutoff_writeback_sync,
|
||||
&sysfs_feature_compat,
|
||||
&sysfs_feature_ro_compat,
|
||||
&sysfs_feature_incompat,
|
||||
NULL
|
||||
};
|
||||
KTYPE(bch_cache_set_internal);
|
||||
|
@ -459,10 +459,8 @@ static void read_dirty(struct cached_dev *dc)
|
||||
for (i = 0; i < nk; i++) {
|
||||
w = keys[i];
|
||||
|
||||
io = kzalloc(sizeof(struct dirty_io) +
|
||||
sizeof(struct bio_vec) *
|
||||
DIV_ROUND_UP(KEY_SIZE(&w->key),
|
||||
PAGE_SECTORS),
|
||||
io = kzalloc(struct_size(io, bio.bi_inline_vecs,
|
||||
DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)),
|
||||
GFP_KERNEL);
|
||||
if (!io)
|
||||
goto err;
|
||||
@ -523,15 +521,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode,
|
||||
uint64_t offset, int nr_sectors)
|
||||
{
|
||||
struct bcache_device *d = c->devices[inode];
|
||||
unsigned int stripe_offset, stripe, sectors_dirty;
|
||||
unsigned int stripe_offset, sectors_dirty;
|
||||
int stripe;
|
||||
|
||||
if (!d)
|
||||
return;
|
||||
|
||||
stripe = offset_to_stripe(d, offset);
|
||||
if (stripe < 0)
|
||||
return;
|
||||
|
||||
if (UUID_FLASH_ONLY(&c->uuids[inode]))
|
||||
atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
|
||||
|
||||
stripe = offset_to_stripe(d, offset);
|
||||
stripe_offset = offset & (d->stripe_size - 1);
|
||||
|
||||
while (nr_sectors) {
|
||||
@ -571,12 +573,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
|
||||
static void refill_full_stripes(struct cached_dev *dc)
|
||||
{
|
||||
struct keybuf *buf = &dc->writeback_keys;
|
||||
unsigned int start_stripe, stripe, next_stripe;
|
||||
unsigned int start_stripe, next_stripe;
|
||||
int stripe;
|
||||
bool wrapped = false;
|
||||
|
||||
stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));
|
||||
|
||||
if (stripe >= dc->disk.nr_stripes)
|
||||
if (stripe < 0)
|
||||
stripe = 0;
|
||||
|
||||
start_stripe = stripe;
|
||||
@ -825,10 +827,8 @@ static int bch_dirty_init_thread(void *arg)
|
||||
struct btree_iter iter;
|
||||
struct bkey *k, *p;
|
||||
int cur_idx, prev_idx, skip_nr;
|
||||
int i;
|
||||
|
||||
k = p = NULL;
|
||||
i = 0;
|
||||
cur_idx = prev_idx = 0;
|
||||
|
||||
bch_btree_iter_init(&c->root->keys, &iter, NULL);
|
||||
|
@ -52,10 +52,22 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned int offset_to_stripe(struct bcache_device *d,
|
||||
static inline int offset_to_stripe(struct bcache_device *d,
|
||||
uint64_t offset)
|
||||
{
|
||||
do_div(offset, d->stripe_size);
|
||||
|
||||
/* d->nr_stripes is in range [1, INT_MAX] */
|
||||
if (unlikely(offset >= d->nr_stripes)) {
|
||||
pr_err("Invalid stripe %llu (>= nr_stripes %d).\n",
|
||||
offset, d->nr_stripes);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Here offset is definitely smaller than INT_MAX,
|
||||
* return it as int will never overflow.
|
||||
*/
|
||||
return offset;
|
||||
}
|
||||
|
||||
@ -63,7 +75,10 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
|
||||
uint64_t offset,
|
||||
unsigned int nr_sectors)
|
||||
{
|
||||
unsigned int stripe = offset_to_stripe(&dc->disk, offset);
|
||||
int stripe = offset_to_stripe(&dc->disk, offset);
|
||||
|
||||
if (stripe < 0)
|
||||
return false;
|
||||
|
||||
while (1) {
|
||||
if (atomic_read(dc->disk.stripe_sectors_dirty + stripe))
|
||||
|
@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
|
||||
s += blocks;
|
||||
}
|
||||
bitmap->last_end_sync = jiffies;
|
||||
sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
|
||||
}
|
||||
EXPORT_SYMBOL(md_bitmap_cond_end_sync);
|
||||
|
||||
|
@ -1518,6 +1518,7 @@ static void unlock_all_bitmaps(struct mddev *mddev)
|
||||
}
|
||||
}
|
||||
kfree(cinfo->other_bitmap_lockres);
|
||||
cinfo->other_bitmap_lockres = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
drivers/md/md.c | 181
@ -101,6 +101,8 @@ static void mddev_detach(struct mddev *mddev);
|
||||
* count by 2 for every hour elapsed between read errors.
|
||||
*/
|
||||
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
|
||||
/* Default safemode delay: 200 msec */
|
||||
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
|
||||
/*
|
||||
* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
|
||||
* is 1000 KB/sec, so the extra system load does not show up that much.
|
||||
@ -463,12 +465,38 @@ check_suspended:
|
||||
}
|
||||
EXPORT_SYMBOL(md_handle_request);
|
||||
|
||||
struct md_io {
|
||||
struct mddev *mddev;
|
||||
bio_end_io_t *orig_bi_end_io;
|
||||
void *orig_bi_private;
|
||||
unsigned long start_time;
|
||||
};
|
||||
|
||||
static void md_end_io(struct bio *bio)
|
||||
{
|
||||
struct md_io *md_io = bio->bi_private;
|
||||
struct mddev *mddev = md_io->mddev;
|
||||
|
||||
disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time);
|
||||
|
||||
bio->bi_end_io = md_io->orig_bi_end_io;
|
||||
bio->bi_private = md_io->orig_bi_private;
|
||||
|
||||
mempool_free(md_io, &mddev->md_io_pool);
|
||||
|
||||
if (bio->bi_end_io)
|
||||
bio->bi_end_io(bio);
|
||||
}
|
||||
|
||||
static blk_qc_t md_submit_bio(struct bio *bio)
|
||||
{
|
||||
const int rw = bio_data_dir(bio);
|
||||
const int sgrp = op_stat_group(bio_op(bio));
|
||||
struct mddev *mddev = bio->bi_disk->private_data;
|
||||
unsigned int sectors;
|
||||
|
||||
if (mddev == NULL || mddev->pers == NULL) {
|
||||
bio_io_error(bio);
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
|
||||
bio_io_error(bio);
|
||||
@ -477,10 +505,6 @@ static blk_qc_t md_submit_bio(struct bio *bio)
|
||||
|
||||
blk_queue_split(&bio);
|
||||
|
||||
if (mddev == NULL || mddev->pers == NULL) {
|
||||
bio_io_error(bio);
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
|
||||
if (bio_sectors(bio) != 0)
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
@ -488,21 +512,27 @@ static blk_qc_t md_submit_bio(struct bio *bio)
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
/*
|
||||
* save the sectors now since our bio can
|
||||
* go away inside make_request
|
||||
*/
|
||||
sectors = bio_sectors(bio);
|
||||
if (bio->bi_end_io != md_end_io) {
|
||||
struct md_io *md_io;
|
||||
|
||||
md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO);
|
||||
md_io->mddev = mddev;
|
||||
md_io->orig_bi_end_io = bio->bi_end_io;
|
||||
md_io->orig_bi_private = bio->bi_private;
|
||||
|
||||
bio->bi_end_io = md_end_io;
|
||||
bio->bi_private = md_io;
|
||||
|
||||
md_io->start_time = disk_start_io_acct(mddev->gendisk,
|
||||
bio_sectors(bio),
|
||||
bio_op(bio));
|
||||
}
|
||||
|
||||
/* bio could be mergeable after passing to underlayer */
|
||||
bio->bi_opf &= ~REQ_NOMERGE;
|
||||
|
||||
md_handle_request(mddev, bio);
|
||||
|
||||
part_stat_lock();
|
||||
part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
|
||||
part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
|
||||
part_stat_unlock();
|
||||
|
||||
return BLK_QC_T_NONE;
|
||||
}
|
||||
|
||||
@ -928,7 +958,8 @@ static void super_written(struct bio *bio)
|
||||
struct mddev *mddev = rdev->mddev;
|
||||
|
||||
if (bio->bi_status) {
|
||||
pr_err("md: super_written gets error=%d\n", bio->bi_status);
|
||||
pr_err("md: %s gets error=%d\n", __func__,
|
||||
blk_status_to_errno(bio->bi_status));
|
||||
md_error(mddev, rdev);
|
||||
if (!test_bit(Faulty, &rdev->flags)
|
||||
&& (bio->bi_opf & MD_FAILFAST)) {
|
||||
@ -2143,6 +2174,24 @@ retry:
|
||||
sb->sb_csum = calc_sb_1_csum(sb);
|
||||
}
|
||||
|
||||
static sector_t super_1_choose_bm_space(sector_t dev_size)
|
||||
{
|
||||
sector_t bm_space;
|
||||
|
||||
/* if the device is bigger than 8Gig, save 64k for bitmap
|
||||
* usage, if bigger than 200Gig, save 128k
|
||||
*/
|
||||
if (dev_size < 64*2)
|
||||
bm_space = 0;
|
||||
else if (dev_size - 64*2 >= 200*1024*1024*2)
|
||||
bm_space = 128*2;
|
||||
else if (dev_size - 4*2 > 8*1024*1024*2)
|
||||
bm_space = 64*2;
|
||||
else
|
||||
bm_space = 4*2;
|
||||
return bm_space;
|
||||
}
|
||||
|
||||
static unsigned long long
|
||||
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
|
||||
{
|
||||
@ -2163,13 +2212,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
|
||||
return 0;
|
||||
} else {
|
||||
/* minor version 0; superblock after data */
|
||||
sector_t sb_start;
|
||||
sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
|
||||
sector_t sb_start, bm_space;
|
||||
sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
|
||||
|
||||
/* 8K is for superblock */
|
||||
sb_start = dev_size - 8*2;
|
||||
sb_start &= ~(sector_t)(4*2 - 1);
|
||||
max_sectors = rdev->sectors + sb_start - rdev->sb_start;
|
||||
|
||||
bm_space = super_1_choose_bm_space(dev_size);
|
||||
|
||||
/* Space that can be used to store data needs to decrease
|
||||
* superblock bitmap space and bad block space(4K)
|
||||
*/
|
||||
max_sectors = sb_start - bm_space - 4*2;
|
||||
|
||||
if (!num_sectors || num_sectors > max_sectors)
|
||||
num_sectors = max_sectors;
|
||||
rdev->sb_start = sb_start;
|
||||
}
|
||||
sb = page_address(rdev->sb_page);
|
||||
sb->data_size = cpu_to_le64(num_sectors);
|
||||
@ -2421,9 +2479,13 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
|
||||
goto fail;
|
||||
|
||||
ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
|
||||
if (sysfs_create_link(&rdev->kobj, ko, "block"))
|
||||
/* failure here is OK */;
|
||||
/* failure here is OK */
|
||||
err = sysfs_create_link(&rdev->kobj, ko, "block");
|
||||
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
|
||||
rdev->sysfs_unack_badblocks =
|
||||
sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks");
|
||||
rdev->sysfs_badblocks =
|
||||
sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
|
||||
|
||||
list_add_rcu(&rdev->same_set, &mddev->disks);
|
||||
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
|
||||
@ -2457,7 +2519,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
|
||||
rdev->mddev = NULL;
|
||||
sysfs_remove_link(&rdev->kobj, "block");
|
||||
sysfs_put(rdev->sysfs_state);
|
||||
sysfs_put(rdev->sysfs_unack_badblocks);
|
||||
sysfs_put(rdev->sysfs_badblocks);
|
||||
rdev->sysfs_state = NULL;
|
||||
rdev->sysfs_unack_badblocks = NULL;
|
||||
rdev->sysfs_badblocks = NULL;
|
||||
rdev->badblocks.count = 0;
|
||||
/* We need to delay this, otherwise we can deadlock when
|
||||
* writing to 'remove' to "dev/state". We also need
|
||||
@ -2802,7 +2868,7 @@ rewrite:
|
||||
goto repeat;
|
||||
wake_up(&mddev->sb_wait);
|
||||
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (test_and_clear_bit(FaultRecorded, &rdev->flags))
|
||||
@ -3182,8 +3248,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
return err;
|
||||
} else
|
||||
sysfs_notify_dirent_safe(rdev->sysfs_state);
|
||||
if (sysfs_link_rdev(rdev->mddev, rdev))
|
||||
/* failure here is OK */;
|
||||
/* failure here is OK */;
|
||||
sysfs_link_rdev(rdev->mddev, rdev);
|
||||
/* don't wakeup anyone, leave that to userspace. */
|
||||
} else {
|
||||
if (slot >= rdev->mddev->raid_disks &&
|
||||
@ -4055,7 +4121,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
|
||||
mddev_resume(mddev);
|
||||
if (!mddev->thread)
|
||||
md_update_sb(mddev, 1);
|
||||
sysfs_notify(&mddev->kobj, NULL, "level");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_level);
|
||||
md_new_event(mddev);
|
||||
rv = len;
|
||||
out_unlock:
|
||||
@ -4167,6 +4233,14 @@ out_unlock:
|
||||
static struct md_sysfs_entry md_raid_disks =
|
||||
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
|
||||
|
||||
static ssize_t
|
||||
uuid_show(struct mddev *mddev, char *page)
|
||||
{
|
||||
return sprintf(page, "%pU\n", mddev->uuid);
|
||||
}
|
||||
static struct md_sysfs_entry md_uuid =
|
||||
__ATTR(uuid, S_IRUGO, uuid_show, NULL);
|
||||
|
||||
static ssize_t
|
||||
chunk_size_show(struct mddev *mddev, char *page)
|
||||
{
|
||||
@ -4808,7 +4882,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
|
||||
}
|
||||
if (err)
|
||||
return err;
|
||||
sysfs_notify(&mddev->kobj, NULL, "degraded");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
|
||||
} else {
|
||||
if (cmd_match(page, "check"))
|
||||
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
|
||||
@ -5423,6 +5497,7 @@ static struct attribute *md_default_attrs[] = {
|
||||
&md_level.attr,
|
||||
&md_layout.attr,
|
||||
&md_raid_disks.attr,
|
||||
&md_uuid.attr,
|
||||
&md_chunk_size.attr,
|
||||
&md_size.attr,
|
||||
&md_resync_start.attr,
|
||||
@ -5514,6 +5589,13 @@ static void md_free(struct kobject *ko)
|
||||
|
||||
if (mddev->sysfs_state)
|
||||
sysfs_put(mddev->sysfs_state);
|
||||
if (mddev->sysfs_completed)
|
||||
sysfs_put(mddev->sysfs_completed);
|
||||
if (mddev->sysfs_degraded)
|
||||
sysfs_put(mddev->sysfs_degraded);
|
||||
if (mddev->sysfs_level)
|
||||
sysfs_put(mddev->sysfs_level);
|
||||
|
||||
|
||||
if (mddev->gendisk)
|
||||
del_gendisk(mddev->gendisk);
|
||||
@ -5525,6 +5607,7 @@ static void md_free(struct kobject *ko)
|
||||
|
||||
bioset_exit(&mddev->bio_set);
|
||||
bioset_exit(&mddev->sync_set);
|
||||
mempool_exit(&mddev->md_io_pool);
|
||||
kfree(mddev);
|
||||
}
|
||||
|
||||
@ -5620,6 +5703,11 @@ static int md_alloc(dev_t dev, char *name)
|
||||
*/
|
||||
mddev->hold_active = UNTIL_STOP;
|
||||
|
||||
error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE,
|
||||
sizeof(struct md_io));
|
||||
if (error)
|
||||
goto abort;
|
||||
|
||||
error = -ENOMEM;
|
||||
mddev->queue = blk_alloc_queue(NUMA_NO_NODE);
|
||||
if (!mddev->queue)
|
||||
@ -5676,6 +5764,9 @@ static int md_alloc(dev_t dev, char *name)
|
||||
if (!error && mddev->kobj.sd) {
|
||||
kobject_uevent(&mddev->kobj, KOBJ_ADD);
|
||||
mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
|
||||
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
|
||||
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
|
||||
mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level");
|
||||
}
|
||||
mddev_put(mddev);
|
||||
return error;
|
||||
@ -5961,7 +6052,7 @@ int md_run(struct mddev *mddev)
|
||||
if (mddev_is_clustered(mddev))
|
||||
mddev->safemode_delay = 0;
|
||||
else
|
||||
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
|
||||
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
|
||||
mddev->in_sync = 1;
|
||||
smp_wmb();
|
||||
spin_lock(&mddev->lock);
|
||||
@ -6028,7 +6119,7 @@ static int do_md_run(struct mddev *mddev)
|
||||
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_action);
|
||||
sysfs_notify(&mddev->kobj, NULL, "degraded");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
|
||||
out:
|
||||
clear_bit(MD_NOT_READY, &mddev->flags);
|
||||
return err;
|
||||
@ -7339,6 +7430,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
|
||||
|
||||
mddev->bitmap_info.nodes = 0;
|
||||
md_cluster_ops->leave(mddev);
|
||||
module_put(md_cluster_mod);
|
||||
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
|
||||
}
|
||||
mddev_suspend(mddev);
|
||||
md_bitmap_destroy(mddev);
|
||||
@ -8330,6 +8423,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
|
||||
|
||||
int md_setup_cluster(struct mddev *mddev, int nodes)
|
||||
{
|
||||
int ret;
|
||||
if (!md_cluster_ops)
|
||||
request_module("md-cluster");
|
||||
spin_lock(&pers_lock);
|
||||
@ -8341,7 +8435,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
|
||||
}
|
||||
spin_unlock(&pers_lock);
|
||||
|
||||
return md_cluster_ops->join(mddev, nodes);
|
||||
ret = md_cluster_ops->join(mddev, nodes);
|
||||
if (!ret)
|
||||
mddev->safemode_delay = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void md_cluster_stop(struct mddev *mddev)
|
||||
@ -8742,7 +8839,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
} else
|
||||
mddev->curr_resync = 3; /* no longer delayed */
|
||||
mddev->curr_resync_completed = j;
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
md_new_event(mddev);
|
||||
update_time = jiffies;
|
||||
|
||||
@ -8770,7 +8867,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
mddev->recovery_cp = j;
|
||||
update_time = jiffies;
|
||||
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
}
|
||||
|
||||
while (j >= mddev->resync_max &&
|
||||
@ -8877,7 +8974,7 @@ void md_do_sync(struct md_thread *thread)
|
||||
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
|
||||
mddev->curr_resync > 3) {
|
||||
mddev->curr_resync_completed = mddev->curr_resync;
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
}
|
||||
mddev->pers->sync_request(mddev, max_sectors, &skipped);
|
||||
|
||||
@ -9007,7 +9104,7 @@ static int remove_and_add_spares(struct mddev *mddev,
|
||||
}
|
||||
|
||||
if (removed && mddev->kobj.sd)
|
||||
sysfs_notify(&mddev->kobj, NULL, "degraded");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
|
||||
|
||||
if (this && removed)
|
||||
goto no_add;
|
||||
@ -9035,8 +9132,8 @@ static int remove_and_add_spares(struct mddev *mddev,
|
||||
rdev->recovery_offset = 0;
|
||||
}
|
||||
if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
|
||||
if (sysfs_link_rdev(mddev, rdev))
|
||||
/* failure here is OK */;
|
||||
/* failure here is OK */
|
||||
sysfs_link_rdev(mddev, rdev);
|
||||
if (!test_bit(Journal, &rdev->flags))
|
||||
spares++;
|
||||
md_new_event(mddev);
|
||||
@ -9290,8 +9387,7 @@ void md_reap_sync_thread(struct mddev *mddev)
|
||||
/* success...*/
|
||||
/* activate any spares */
|
||||
if (mddev->pers->spare_active(mddev)) {
|
||||
sysfs_notify(&mddev->kobj, NULL,
|
||||
"degraded");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
|
||||
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
|
||||
}
|
||||
}
|
||||
@ -9381,8 +9477,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
if (rv == 0) {
|
||||
/* Make sure they get written out promptly */
|
||||
if (test_bit(ExternalBbl, &rdev->flags))
|
||||
sysfs_notify(&rdev->kobj, NULL,
|
||||
"unacknowledged_bad_blocks");
|
||||
sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
|
||||
sysfs_notify_dirent_safe(rdev->sysfs_state);
|
||||
set_mask_bits(&mddev->sb_flags, 0,
|
||||
BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
|
||||
@ -9403,7 +9498,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
||||
s += rdev->data_offset;
|
||||
rv = badblocks_clear(&rdev->badblocks, s, sectors);
|
||||
if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
|
||||
sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
|
||||
sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
|
||||
return rv;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
|
||||
@ -9633,7 +9728,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
|
||||
if (rdev->recovery_offset == MaxSector &&
|
||||
!test_bit(In_sync, &rdev->flags) &&
|
||||
mddev->pers->spare_active(mddev))
|
||||
sysfs_notify(&mddev->kobj, NULL, "degraded");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
|
||||
|
||||
put_page(swapout);
|
||||
return 0;
|
||||
|
@ -126,7 +126,10 @@ struct md_rdev {
|
||||
|
||||
struct kernfs_node *sysfs_state; /* handle for 'state'
|
||||
* sysfs entry */
|
||||
|
||||
/* handle for 'unacknowledged_bad_blocks' sysfs dentry */
|
||||
struct kernfs_node *sysfs_unack_badblocks;
|
||||
/* handle for 'bad_blocks' sysfs dentry */
|
||||
struct kernfs_node *sysfs_badblocks;
|
||||
struct badblocks badblocks;
|
||||
|
||||
struct {
|
||||
@ -420,6 +423,9 @@ struct mddev {
|
||||
* file in sysfs.
|
||||
*/
|
||||
struct kernfs_node *sysfs_action; /* handle for 'sync_action' */
|
||||
struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */
|
||||
struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
|
||||
struct kernfs_node *sysfs_level; /*handle for 'level' */
|
||||
|
||||
struct work_struct del_work; /* used for delayed sysfs removal */
|
||||
|
||||
@ -481,6 +487,7 @@ struct mddev {
|
||||
struct bio_set sync_set; /* for sync operations like
|
||||
* metadata and bitmap writes
|
||||
*/
|
||||
mempool_t md_io_pool;
|
||||
|
||||
/* Generic flush handling.
|
||||
* The last to finish preflush schedules a worker to submit
|
||||
|
@ -955,6 +955,7 @@ static void wait_barrier(struct r10conf *conf)
|
||||
{
|
||||
spin_lock_irq(&conf->resync_lock);
|
||||
if (conf->barrier) {
|
||||
struct bio_list *bio_list = current->bio_list;
|
||||
conf->nr_waiting++;
|
||||
/* Wait for the barrier to drop.
|
||||
* However if there are already pending
|
||||
@ -969,9 +970,16 @@ static void wait_barrier(struct r10conf *conf)
|
||||
wait_event_lock_irq(conf->wait_barrier,
|
||||
!conf->barrier ||
|
||||
(atomic_read(&conf->nr_pending) &&
|
||||
current->bio_list &&
|
||||
(!bio_list_empty(¤t->bio_list[0]) ||
|
||||
!bio_list_empty(¤t->bio_list[1]))),
|
||||
bio_list &&
|
||||
(!bio_list_empty(&bio_list[0]) ||
|
||||
!bio_list_empty(&bio_list[1]))) ||
|
||||
/* move on if recovery thread is
|
||||
* blocked by us
|
||||
*/
|
||||
(conf->mddev->thread->tsk == current &&
|
||||
test_bit(MD_RECOVERY_RUNNING,
|
||||
&conf->mddev->recovery) &&
|
||||
conf->nr_queued > 0),
|
||||
conf->resync_lock);
|
||||
conf->nr_waiting--;
|
||||
if (!conf->nr_waiting)
|
||||
@ -4282,8 +4290,8 @@ out:
|
||||
else
|
||||
rdev->recovery_offset = 0;
|
||||
|
||||
if (sysfs_link_rdev(mddev, rdev))
|
||||
/* Failure here is OK */;
|
||||
/* Failure here is OK */
|
||||
sysfs_link_rdev(mddev, rdev);
|
||||
}
|
||||
} else if (rdev->raid_disk >= conf->prev.raid_disks
|
||||
&& !test_bit(Faulty, &rdev->flags)) {
|
||||
@ -4429,7 +4437,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
|
||||
sector_nr = conf->reshape_progress;
|
||||
if (sector_nr) {
|
||||
mddev->curr_resync_completed = sector_nr;
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
*skipped = 1;
|
||||
return sector_nr;
|
||||
}
|
||||
|
@ -195,9 +195,7 @@ struct r5l_log {
|
||||
static inline sector_t r5c_tree_index(struct r5conf *conf,
|
||||
sector_t sect)
|
||||
{
|
||||
sector_t offset;
|
||||
|
||||
offset = sector_div(sect, conf->chunk_sectors);
|
||||
sector_div(sect, conf->chunk_sectors);
|
||||
return sect;
|
||||
}
|
||||
|
||||
@ -298,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
|
||||
wbi = dev->written;
|
||||
dev->written = NULL;
|
||||
while (wbi && wbi->bi_iter.bi_sector <
|
||||
dev->sector + STRIPE_SECTORS) {
|
||||
wbi2 = r5_next_bio(wbi, dev->sector);
|
||||
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
wbi2 = r5_next_bio(conf, wbi, dev->sector);
|
||||
md_write_end(conf->mddev);
|
||||
bio_endio(wbi);
|
||||
wbi = wbi2;
|
||||
@ -316,7 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
|
||||
set_bit(R5_UPTODATE, &sh->dev[i].flags);
|
||||
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
|
||||
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS,
|
||||
RAID5_STRIPE_SECTORS(conf),
|
||||
!test_bit(STRIPE_DEGRADED, &sh->state),
|
||||
0);
|
||||
}
|
||||
@ -364,7 +362,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
|
||||
*/
|
||||
if (atomic_read(&conf->r5c_cached_full_stripes) >=
|
||||
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
|
||||
conf->chunk_sectors >> STRIPE_SHIFT))
|
||||
conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
|
||||
r5l_wake_reclaim(conf->log, 0);
|
||||
}
|
||||
|
||||
@ -2430,10 +2428,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
|
||||
struct mddev *mddev = log->rdev->mddev;
|
||||
struct r5conf *conf = mddev->private;
|
||||
struct stripe_head *sh, *next;
|
||||
bool cleared_pending = false;
|
||||
|
||||
if (ctx->data_only_stripes == 0)
|
||||
return;
|
||||
|
||||
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
|
||||
cleared_pending = true;
|
||||
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
|
||||
}
|
||||
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
|
||||
|
||||
list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
|
||||
@ -2448,6 +2451,8 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
|
||||
atomic_read(&conf->active_stripes) == 0);
|
||||
|
||||
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
|
||||
if (cleared_pending)
|
||||
set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
|
||||
}
|
||||
|
||||
static int r5l_recovery_log(struct r5l_log *log)
|
||||
@ -2532,13 +2537,10 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
|
||||
struct r5conf *conf;
|
||||
int ret;
|
||||
|
||||
ret = mddev_lock(mddev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock(&mddev->lock);
|
||||
conf = mddev->private;
|
||||
if (!conf || !conf->log) {
|
||||
mddev_unlock(mddev);
|
||||
spin_unlock(&mddev->lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2558,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
|
||||
default:
|
||||
ret = 0;
|
||||
}
|
||||
mddev_unlock(mddev);
|
||||
spin_unlock(&mddev->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -324,7 +324,7 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
|
||||
* be just after the last logged stripe and write to the same
|
||||
* disks. Use bit shift and logarithm to avoid 64-bit division.
|
||||
*/
|
||||
if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
|
||||
if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) &&
|
||||
(data_sector >> ilog2(conf->chunk_sectors) ==
|
||||
data_sector_last >> ilog2(conf->chunk_sectors)) &&
|
||||
((data_sector - data_sector_last) * data_disks ==
|
||||
@ -844,9 +844,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
|
||||
|
||||
/* if start and end is 4k aligned, use a 4k block */
|
||||
if (block_size == 512 &&
|
||||
(r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
|
||||
(r_sector_last & (STRIPE_SECTORS - 1)) == 0)
|
||||
block_size = STRIPE_SIZE;
|
||||
(r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 &&
|
||||
(r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0)
|
||||
block_size = RAID5_STRIPE_SIZE(conf);
|
||||
|
||||
/* iterate through blocks in strip */
|
||||
for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
|
||||
@ -1274,7 +1274,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
|
||||
ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
|
||||
|
||||
if (ppl_data_sectors > 0)
|
||||
ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
|
||||
ppl_data_sectors = rounddown(ppl_data_sectors,
|
||||
RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private));
|
||||
|
||||
if (ppl_data_sectors <= 0) {
|
||||
pr_warn("md/raid:%s: PPL space too small on %s\n",
|
||||
|
@ -69,13 +69,13 @@ static struct workqueue_struct *raid5_wq;
|
||||
|
||||
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
|
||||
{
|
||||
int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
|
||||
int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
|
||||
return &conf->stripe_hashtbl[hash];
|
||||
}
|
||||
|
||||
static inline int stripe_hash_locks_hash(sector_t sect)
|
||||
static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
|
||||
{
|
||||
return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
|
||||
return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
|
||||
}
|
||||
|
||||
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
|
||||
@ -627,7 +627,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
|
||||
int previous, int noblock, int noquiesce)
|
||||
{
|
||||
struct stripe_head *sh;
|
||||
int hash = stripe_hash_locks_hash(sector);
|
||||
int hash = stripe_hash_locks_hash(conf, sector);
|
||||
int inc_empty_inactive_list_flag;
|
||||
|
||||
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
|
||||
@ -748,9 +748,9 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
|
||||
tmp_sec = sh->sector;
|
||||
if (!sector_div(tmp_sec, conf->chunk_sectors))
|
||||
return;
|
||||
head_sector = sh->sector - STRIPE_SECTORS;
|
||||
head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf);
|
||||
|
||||
hash = stripe_hash_locks_hash(head_sector);
|
||||
hash = stripe_hash_locks_hash(conf, head_sector);
|
||||
spin_lock_irq(conf->hash_locks + hash);
|
||||
head = __find_stripe(conf, head_sector, conf->generation);
|
||||
if (head && !atomic_inc_not_zero(&head->count)) {
|
||||
@ -1057,7 +1057,7 @@ again:
|
||||
test_bit(WriteErrorSeen, &rdev->flags)) {
|
||||
sector_t first_bad;
|
||||
int bad_sectors;
|
||||
int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
|
||||
int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
&first_bad, &bad_sectors);
|
||||
if (!bad)
|
||||
break;
|
||||
@ -1089,7 +1089,7 @@ again:
|
||||
if (rdev) {
|
||||
if (s->syncing || s->expanding || s->expanded
|
||||
|| s->replacing)
|
||||
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
|
||||
md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
|
||||
|
||||
set_bit(STRIPE_IO_STARTED, &sh->state);
|
||||
|
||||
@ -1129,9 +1129,9 @@ again:
|
||||
else
|
||||
sh->dev[i].vec.bv_page = sh->dev[i].page;
|
||||
bi->bi_vcnt = 1;
|
||||
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
|
||||
bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
|
||||
bi->bi_io_vec[0].bv_offset = 0;
|
||||
bi->bi_iter.bi_size = STRIPE_SIZE;
|
||||
bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
|
||||
bi->bi_write_hint = sh->dev[i].write_hint;
|
||||
if (!rrdev)
|
||||
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
|
||||
@ -1156,7 +1156,7 @@ again:
|
||||
if (rrdev) {
|
||||
if (s->syncing || s->expanding || s->expanded
|
||||
|| s->replacing)
|
||||
md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
|
||||
md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
|
||||
|
||||
set_bit(STRIPE_IO_STARTED, &sh->state);
|
||||
|
||||
@ -1183,9 +1183,9 @@ again:
|
||||
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
|
||||
sh->dev[i].rvec.bv_page = sh->dev[i].page;
|
||||
rbi->bi_vcnt = 1;
|
||||
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
|
||||
rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf);
|
||||
rbi->bi_io_vec[0].bv_offset = 0;
|
||||
rbi->bi_iter.bi_size = STRIPE_SIZE;
|
||||
rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf);
|
||||
rbi->bi_write_hint = sh->dev[i].write_hint;
|
||||
sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET;
|
||||
/*
|
||||
@ -1235,6 +1235,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
|
||||
int page_offset;
|
||||
struct async_submit_ctl submit;
|
||||
enum async_tx_flags flags = 0;
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
|
||||
if (bio->bi_iter.bi_sector >= sector)
|
||||
page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
|
||||
@ -1256,8 +1257,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
|
||||
len -= b_offset;
|
||||
}
|
||||
|
||||
if (len > 0 && page_offset + len > STRIPE_SIZE)
|
||||
clen = STRIPE_SIZE - page_offset;
|
||||
if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf))
|
||||
clen = RAID5_STRIPE_SIZE(conf) - page_offset;
|
||||
else
|
||||
clen = len;
|
||||
|
||||
@ -1265,9 +1266,9 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
|
||||
b_offset += bvl.bv_offset;
|
||||
bio_page = bvl.bv_page;
|
||||
if (frombio) {
|
||||
if (sh->raid_conf->skip_copy &&
|
||||
if (conf->skip_copy &&
|
||||
b_offset == 0 && page_offset == 0 &&
|
||||
clen == STRIPE_SIZE &&
|
||||
clen == RAID5_STRIPE_SIZE(conf) &&
|
||||
!no_skipcopy)
|
||||
*page = bio_page;
|
||||
else
|
||||
@ -1292,6 +1293,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
|
||||
{
|
||||
struct stripe_head *sh = stripe_head_ref;
|
||||
int i;
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
|
||||
pr_debug("%s: stripe %llu\n", __func__,
|
||||
(unsigned long long)sh->sector);
|
||||
@ -1312,8 +1314,8 @@ static void ops_complete_biofill(void *stripe_head_ref)
|
||||
rbi = dev->read;
|
||||
dev->read = NULL;
|
||||
while (rbi && rbi->bi_iter.bi_sector <
|
||||
dev->sector + STRIPE_SECTORS) {
|
||||
rbi2 = r5_next_bio(rbi, dev->sector);
|
||||
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
rbi2 = r5_next_bio(conf, rbi, dev->sector);
|
||||
bio_endio(rbi);
|
||||
rbi = rbi2;
|
||||
}
|
||||
@ -1330,6 +1332,7 @@ static void ops_run_biofill(struct stripe_head *sh)
|
||||
struct dma_async_tx_descriptor *tx = NULL;
|
||||
struct async_submit_ctl submit;
|
||||
int i;
|
||||
struct r5conf *conf = sh->raid_conf;
|
||||
|
||||
BUG_ON(sh->batch_head);
|
||||
pr_debug("%s: stripe %llu\n", __func__,
|
||||
@ -1344,10 +1347,10 @@ static void ops_run_biofill(struct stripe_head *sh)
|
||||
dev->toread = NULL;
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
while (rbi && rbi->bi_iter.bi_sector <
|
||||
dev->sector + STRIPE_SECTORS) {
|
||||
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
tx = async_copy_data(0, rbi, &dev->page,
|
||||
dev->sector, tx, sh, 0);
|
||||
rbi = r5_next_bio(rbi, dev->sector);
|
||||
rbi = r5_next_bio(conf, rbi, dev->sector);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1429,9 +1432,11 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
|
||||
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
|
||||
if (unlikely(count == 1))
|
||||
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
|
||||
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
else
|
||||
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
|
||||
tx = async_xor(xor_dest, xor_srcs, 0, count,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
|
||||
return tx;
|
||||
}
|
||||
@ -1522,7 +1527,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
|
||||
ops_complete_compute, sh,
|
||||
to_addr_conv(sh, percpu, 0));
|
||||
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
|
||||
tx = async_gen_syndrome(blocks, 0, count+2,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
} else {
|
||||
/* Compute any data- or p-drive using XOR */
|
||||
count = 0;
|
||||
@ -1535,7 +1541,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
|
||||
NULL, ops_complete_compute, sh,
|
||||
to_addr_conv(sh, percpu, 0));
|
||||
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
|
||||
tx = async_xor(dest, blocks, 0, count,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
}
|
||||
|
||||
return tx;
|
||||
@ -1598,7 +1605,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
ops_complete_compute, sh,
|
||||
to_addr_conv(sh, percpu, 0));
|
||||
return async_gen_syndrome(blocks, 0, syndrome_disks+2,
|
||||
STRIPE_SIZE, &submit);
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf),
|
||||
&submit);
|
||||
} else {
|
||||
struct page *dest;
|
||||
int data_target;
|
||||
@ -1621,7 +1629,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
|
||||
NULL, NULL, NULL,
|
||||
to_addr_conv(sh, percpu, 0));
|
||||
tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
|
||||
tx = async_xor(dest, blocks, 0, count,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf),
|
||||
&submit);
|
||||
|
||||
count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
|
||||
@ -1629,7 +1638,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
ops_complete_compute, sh,
|
||||
to_addr_conv(sh, percpu, 0));
|
||||
return async_gen_syndrome(blocks, 0, count+2,
|
||||
STRIPE_SIZE, &submit);
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf),
|
||||
&submit);
|
||||
}
|
||||
} else {
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
|
||||
@ -1638,13 +1648,15 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
if (failb == syndrome_disks) {
|
||||
/* We're missing D+P. */
|
||||
return async_raid6_datap_recov(syndrome_disks+2,
|
||||
STRIPE_SIZE, faila,
|
||||
blocks, &submit);
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf),
|
||||
faila,
|
||||
blocks, &submit);
|
||||
} else {
|
||||
/* We're missing D+D. */
|
||||
return async_raid6_2data_recov(syndrome_disks+2,
|
||||
STRIPE_SIZE, faila, failb,
|
||||
blocks, &submit);
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf),
|
||||
faila, failb,
|
||||
blocks, &submit);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1691,7 +1703,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
|
||||
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
|
||||
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
|
||||
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
|
||||
tx = async_xor(xor_dest, xor_srcs, 0, count,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
|
||||
return tx;
|
||||
}
|
||||
@ -1711,7 +1724,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
|
||||
|
||||
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
|
||||
ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
|
||||
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
|
||||
tx = async_gen_syndrome(blocks, 0, count+2,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
|
||||
return tx;
|
||||
}
|
||||
@ -1752,7 +1766,7 @@ again:
|
||||
WARN_ON(dev->page != dev->orig_page);
|
||||
|
||||
while (wbi && wbi->bi_iter.bi_sector <
|
||||
dev->sector + STRIPE_SECTORS) {
|
||||
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
if (wbi->bi_opf & REQ_FUA)
|
||||
set_bit(R5_WantFUA, &dev->flags);
|
||||
if (wbi->bi_opf & REQ_SYNC)
|
||||
@ -1770,7 +1784,7 @@ again:
|
||||
clear_bit(R5_OVERWRITE, &dev->flags);
|
||||
}
|
||||
}
|
||||
wbi = r5_next_bio(wbi, dev->sector);
|
||||
wbi = r5_next_bio(conf, wbi, dev->sector);
|
||||
}
|
||||
|
||||
if (head_sh->batch_head) {
|
||||
@ -1910,9 +1924,11 @@ again:
|
||||
}
|
||||
|
||||
if (unlikely(count == 1))
|
||||
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
|
||||
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
else
|
||||
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
|
||||
tx = async_xor(xor_dest, xor_srcs, 0, count,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
if (!last_stripe) {
|
||||
j++;
|
||||
sh = list_first_entry(&sh->batch_list, struct stripe_head,
|
||||
@ -1972,7 +1988,8 @@ again:
|
||||
} else
|
||||
init_async_submit(&submit, 0, tx, NULL, NULL,
|
||||
to_addr_conv(sh, percpu, j));
|
||||
tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
|
||||
tx = async_gen_syndrome(blocks, 0, count+2,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf), &submit);
|
||||
if (!last_stripe) {
|
||||
j++;
|
||||
sh = list_first_entry(&sh->batch_list, struct stripe_head,
|
||||
@ -2020,7 +2037,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
|
||||
|
||||
init_async_submit(&submit, 0, NULL, NULL, NULL,
|
||||
to_addr_conv(sh, percpu, 0));
|
||||
tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
|
||||
tx = async_xor_val(xor_dest, xor_srcs, 0, count,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf),
|
||||
&sh->ops.zero_sum_result, &submit);
|
||||
|
||||
atomic_inc(&sh->count);
|
||||
@ -2045,7 +2063,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
|
||||
atomic_inc(&sh->count);
|
||||
init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
|
||||
sh, to_addr_conv(sh, percpu, 0));
|
||||
async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
|
||||
async_syndrome_val(srcs, 0, count+2,
|
||||
RAID5_STRIPE_SIZE(sh->raid_conf),
|
||||
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
|
||||
}
|
||||
|
||||
@ -2217,9 +2236,9 @@ static int grow_stripes(struct r5conf *conf, int num)
|
||||
/**
|
||||
* scribble_alloc - allocate percpu scribble buffer for required size
|
||||
* of the scribble region
|
||||
* @percpu - from for_each_present_cpu() of the caller
|
||||
* @num - total number of disks in the array
|
||||
* @cnt - scribble objs count for required size of the scribble region
|
||||
* @percpu: from for_each_present_cpu() of the caller
|
||||
* @num: total number of disks in the array
|
||||
* @cnt: scribble objs count for required size of the scribble region
|
||||
*
|
||||
* The scribble buffer size must be enough to contain:
|
||||
* 1/ a struct page pointer for each device in the array +2
|
||||
@ -2275,7 +2294,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
|
||||
|
||||
percpu = per_cpu_ptr(conf->percpu, cpu);
|
||||
err = scribble_alloc(percpu, new_disks,
|
||||
new_sectors / STRIPE_SECTORS);
|
||||
new_sectors / RAID5_STRIPE_SECTORS(conf));
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
@ -2509,10 +2528,10 @@ static void raid5_end_read_request(struct bio * bi)
|
||||
*/
|
||||
pr_info_ratelimited(
|
||||
"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
|
||||
mdname(conf->mddev), STRIPE_SECTORS,
|
||||
mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf),
|
||||
(unsigned long long)s,
|
||||
bdevname(rdev->bdev, b));
|
||||
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
|
||||
atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors);
|
||||
clear_bit(R5_ReadError, &sh->dev[i].flags);
|
||||
clear_bit(R5_ReWrite, &sh->dev[i].flags);
|
||||
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
|
||||
@ -2585,7 +2604,7 @@ static void raid5_end_read_request(struct bio * bi)
|
||||
if (!(set_bad
|
||||
&& test_bit(In_sync, &rdev->flags)
|
||||
&& rdev_set_badblocks(
|
||||
rdev, sh->sector, STRIPE_SECTORS, 0)))
|
||||
rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
|
||||
md_error(conf->mddev, rdev);
|
||||
}
|
||||
}
|
||||
@ -2637,7 +2656,7 @@ static void raid5_end_write_request(struct bio *bi)
|
||||
if (bi->bi_status)
|
||||
md_error(conf->mddev, rdev);
|
||||
else if (is_badblock(rdev, sh->sector,
|
||||
STRIPE_SECTORS,
|
||||
RAID5_STRIPE_SECTORS(conf),
|
||||
&first_bad, &bad_sectors))
|
||||
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
|
||||
} else {
|
||||
@ -2649,7 +2668,7 @@ static void raid5_end_write_request(struct bio *bi)
|
||||
set_bit(MD_RECOVERY_NEEDED,
|
||||
&rdev->mddev->recovery);
|
||||
} else if (is_badblock(rdev, sh->sector,
|
||||
STRIPE_SECTORS,
|
||||
RAID5_STRIPE_SECTORS(conf),
|
||||
&first_bad, &bad_sectors)) {
|
||||
set_bit(R5_MadeGood, &sh->dev[i].flags);
|
||||
if (test_bit(R5_ReadError, &sh->dev[i].flags))
|
||||
@ -3283,13 +3302,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
|
||||
/* check if page is covered */
|
||||
sector_t sector = sh->dev[dd_idx].sector;
|
||||
for (bi=sh->dev[dd_idx].towrite;
|
||||
sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
|
||||
sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) &&
|
||||
bi && bi->bi_iter.bi_sector <= sector;
|
||||
bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
|
||||
bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) {
|
||||
if (bio_end_sector(bi) >= sector)
|
||||
sector = bio_end_sector(bi);
|
||||
}
|
||||
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
|
||||
if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf))
|
||||
if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
|
||||
sh->overwrite_disks++;
|
||||
}
|
||||
@ -3314,7 +3333,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
|
||||
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
|
||||
spin_unlock_irq(&sh->stripe_lock);
|
||||
md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS, 0);
|
||||
RAID5_STRIPE_SECTORS(conf), 0);
|
||||
spin_lock_irq(&sh->stripe_lock);
|
||||
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
|
||||
if (!sh->batch_head) {
|
||||
@ -3376,7 +3395,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
if (!rdev_set_badblocks(
|
||||
rdev,
|
||||
sh->sector,
|
||||
STRIPE_SECTORS, 0))
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
@ -3396,8 +3415,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
|
||||
while (bi && bi->bi_iter.bi_sector <
|
||||
sh->dev[i].sector + STRIPE_SECTORS) {
|
||||
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
|
||||
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector);
|
||||
|
||||
md_write_end(conf->mddev);
|
||||
bio_io_error(bi);
|
||||
@ -3405,7 +3424,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
}
|
||||
if (bitmap_end)
|
||||
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS, 0, 0);
|
||||
RAID5_STRIPE_SECTORS(conf), 0, 0);
|
||||
bitmap_end = 0;
|
||||
/* and fail all 'written' */
|
||||
bi = sh->dev[i].written;
|
||||
@ -3417,8 +3436,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
|
||||
if (bi) bitmap_end = 1;
|
||||
while (bi && bi->bi_iter.bi_sector <
|
||||
sh->dev[i].sector + STRIPE_SECTORS) {
|
||||
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
|
||||
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
|
||||
|
||||
md_write_end(conf->mddev);
|
||||
bio_io_error(bi);
|
||||
@ -3441,9 +3460,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
if (bi)
|
||||
s->to_read--;
|
||||
while (bi && bi->bi_iter.bi_sector <
|
||||
sh->dev[i].sector + STRIPE_SECTORS) {
|
||||
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
struct bio *nextbi =
|
||||
r5_next_bio(bi, sh->dev[i].sector);
|
||||
r5_next_bio(conf, bi, sh->dev[i].sector);
|
||||
|
||||
bio_io_error(bi);
|
||||
bi = nextbi;
|
||||
@ -3451,7 +3470,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
}
|
||||
if (bitmap_end)
|
||||
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS, 0, 0);
|
||||
RAID5_STRIPE_SECTORS(conf), 0, 0);
|
||||
/* If we were in the middle of a write the parity block might
|
||||
* still be locked - so just clear all R5_LOCKED flags
|
||||
*/
|
||||
@ -3496,14 +3515,14 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
|
||||
&& !test_bit(Faulty, &rdev->flags)
|
||||
&& !test_bit(In_sync, &rdev->flags)
|
||||
&& !rdev_set_badblocks(rdev, sh->sector,
|
||||
STRIPE_SECTORS, 0))
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
abort = 1;
|
||||
rdev = rcu_dereference(conf->disks[i].replacement);
|
||||
if (rdev
|
||||
&& !test_bit(Faulty, &rdev->flags)
|
||||
&& !test_bit(In_sync, &rdev->flags)
|
||||
&& !rdev_set_badblocks(rdev, sh->sector,
|
||||
STRIPE_SECTORS, 0))
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
abort = 1;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
@ -3511,7 +3530,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
|
||||
conf->recovery_disabled =
|
||||
conf->mddev->recovery_disabled;
|
||||
}
|
||||
md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
|
||||
}
|
||||
|
||||
static int want_replace(struct stripe_head *sh, int disk_idx)
|
||||
@ -3538,6 +3557,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
|
||||
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
|
||||
&sh->dev[s->failed_num[1]] };
|
||||
int i;
|
||||
bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
|
||||
|
||||
|
||||
if (test_bit(R5_LOCKED, &dev->flags) ||
|
||||
@ -3596,17 +3616,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
|
||||
* devices must be read.
|
||||
*/
|
||||
return 1;
|
||||
|
||||
if (s->failed >= 2 &&
|
||||
(fdev[i]->towrite ||
|
||||
s->failed_num[i] == sh->pd_idx ||
|
||||
s->failed_num[i] == sh->qd_idx) &&
|
||||
!test_bit(R5_UPTODATE, &fdev[i]->flags))
|
||||
/* In max degraded raid6, If the failed disk is P, Q,
|
||||
* or we want to read the failed disk, we need to do
|
||||
* reconstruct-write.
|
||||
*/
|
||||
force_rcw = true;
|
||||
}
|
||||
|
||||
/* If we are forced to do a reconstruct-write, either because
|
||||
* the current RAID6 implementation only supports that, or
|
||||
* because parity cannot be trusted and we are currently
|
||||
* recovering it, there is extra need to be careful.
|
||||
/* If we are forced to do a reconstruct-write, because parity
|
||||
* cannot be trusted and we are currently recovering it, there
|
||||
* is extra need to be careful.
|
||||
* If one of the devices that we would need to read, because
|
||||
* it is not being overwritten (and maybe not written at all)
|
||||
* is missing/faulty, then we need to read everything we can.
|
||||
*/
|
||||
if (sh->raid_conf->level != 6 &&
|
||||
if (!force_rcw &&
|
||||
sh->sector < sh->raid_conf->mddev->recovery_cp)
|
||||
/* reconstruct-write isn't being forced */
|
||||
return 0;
|
||||
@ -3710,7 +3740,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
/*
|
||||
* handle_stripe_fill - read or compute data to satisfy pending requests.
|
||||
*/
|
||||
static void handle_stripe_fill(struct stripe_head *sh,
|
||||
@ -3785,14 +3815,14 @@ returnbi:
|
||||
wbi = dev->written;
|
||||
dev->written = NULL;
|
||||
while (wbi && wbi->bi_iter.bi_sector <
|
||||
dev->sector + STRIPE_SECTORS) {
|
||||
wbi2 = r5_next_bio(wbi, dev->sector);
|
||||
dev->sector + RAID5_STRIPE_SECTORS(conf)) {
|
||||
wbi2 = r5_next_bio(conf, wbi, dev->sector);
|
||||
md_write_end(conf->mddev);
|
||||
bio_endio(wbi);
|
||||
wbi = wbi2;
|
||||
}
|
||||
md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
|
||||
STRIPE_SECTORS,
|
||||
RAID5_STRIPE_SECTORS(conf),
|
||||
!test_bit(STRIPE_DEGRADED, &sh->state),
|
||||
0);
|
||||
if (head_sh->batch_head) {
|
||||
@ -3976,10 +4006,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
|
||||
set_bit(R5_LOCKED, &dev->flags);
|
||||
set_bit(R5_Wantread, &dev->flags);
|
||||
s->locked++;
|
||||
} else {
|
||||
} else
|
||||
set_bit(STRIPE_DELAYED, &sh->state);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4004,10 +4032,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
|
||||
set_bit(R5_Wantread, &dev->flags);
|
||||
s->locked++;
|
||||
qread++;
|
||||
} else {
|
||||
} else
|
||||
set_bit(STRIPE_DELAYED, &sh->state);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (rcw && conf->mddev->queue)
|
||||
@ -4099,7 +4125,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
|
||||
*/
|
||||
set_bit(STRIPE_INSYNC, &sh->state);
|
||||
else {
|
||||
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
|
||||
atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
|
||||
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
|
||||
/* don't try to repair!! */
|
||||
set_bit(STRIPE_INSYNC, &sh->state);
|
||||
@ -4107,7 +4133,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
|
||||
"%llu-%llu\n", mdname(conf->mddev),
|
||||
(unsigned long long) sh->sector,
|
||||
(unsigned long long) sh->sector +
|
||||
STRIPE_SECTORS);
|
||||
RAID5_STRIPE_SECTORS(conf));
|
||||
} else {
|
||||
sh->check_state = check_state_compute_run;
|
||||
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
|
||||
@ -4264,7 +4290,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
|
||||
*/
|
||||
}
|
||||
} else {
|
||||
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
|
||||
atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches);
|
||||
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
|
||||
/* don't try to repair!! */
|
||||
set_bit(STRIPE_INSYNC, &sh->state);
|
||||
@ -4272,7 +4298,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
|
||||
"%llu-%llu\n", mdname(conf->mddev),
|
||||
(unsigned long long) sh->sector,
|
||||
(unsigned long long) sh->sector +
|
||||
STRIPE_SECTORS);
|
||||
RAID5_STRIPE_SECTORS(conf));
|
||||
} else {
|
||||
int *target = &sh->ops.target;
|
||||
|
||||
@ -4343,7 +4369,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
|
||||
/* place all the copies on one channel */
|
||||
init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
|
||||
tx = async_memcpy(sh2->dev[dd_idx].page,
|
||||
sh->dev[i].page, 0, 0, STRIPE_SIZE,
|
||||
sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf),
|
||||
&submit);
|
||||
|
||||
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
|
||||
@ -4442,8 +4468,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
*/
|
||||
rdev = rcu_dereference(conf->disks[i].replacement);
|
||||
if (rdev && !test_bit(Faulty, &rdev->flags) &&
|
||||
rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
|
||||
!is_badblock(rdev, sh->sector, STRIPE_SECTORS,
|
||||
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
|
||||
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
&first_bad, &bad_sectors))
|
||||
set_bit(R5_ReadRepl, &dev->flags);
|
||||
else {
|
||||
@ -4457,7 +4483,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
if (rdev && test_bit(Faulty, &rdev->flags))
|
||||
rdev = NULL;
|
||||
if (rdev) {
|
||||
is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
|
||||
is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
&first_bad, &bad_sectors);
|
||||
if (s->blocked_rdev == NULL
|
||||
&& (test_bit(Blocked, &rdev->flags)
|
||||
@ -4484,7 +4510,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
}
|
||||
} else if (test_bit(In_sync, &rdev->flags))
|
||||
set_bit(R5_Insync, &dev->flags);
|
||||
else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
|
||||
else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset)
|
||||
/* in sync if before recovery_offset */
|
||||
set_bit(R5_Insync, &dev->flags);
|
||||
else if (test_bit(R5_UPTODATE, &dev->flags) &&
|
||||
@ -4573,12 +4599,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
* Return '1' if this is a member of batch, or '0' if it is a lone stripe or
|
||||
* a head which can now be handled.
|
||||
*/
|
||||
static int clear_batch_ready(struct stripe_head *sh)
|
||||
{
|
||||
/* Return '1' if this is a member of batch, or
|
||||
* '0' if it is a lone stripe or a head which can now be
|
||||
* handled.
|
||||
*/
|
||||
struct stripe_head *tmp;
|
||||
if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
|
||||
return (sh->batch_head && sh->batch_head != sh);
|
||||
@ -4682,6 +4708,16 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
struct r5dev *pdev, *qdev;
|
||||
|
||||
clear_bit(STRIPE_HANDLE, &sh->state);
|
||||
|
||||
/*
|
||||
* handle_stripe should not continue handle the batched stripe, only
|
||||
* the head of batch list or lone stripe can continue. Otherwise we
|
||||
* could see break_stripe_batch_list warns about the STRIPE_ACTIVE
|
||||
* is set for the batched stripe.
|
||||
*/
|
||||
if (clear_batch_ready(sh))
|
||||
return;
|
||||
|
||||
if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
|
||||
/* already being handled, ensure it gets handled
|
||||
* again when current action finishes */
|
||||
@ -4689,11 +4725,6 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
return;
|
||||
}
|
||||
|
||||
if (clear_batch_ready(sh) ) {
|
||||
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
|
||||
return;
|
||||
}
|
||||
|
||||
if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
|
||||
break_stripe_batch_list(sh, 0);
|
||||
|
||||
@ -4842,7 +4873,7 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
* or to load a block that is being partially written.
|
||||
*/
|
||||
if (s.to_read || s.non_overwrite
|
||||
|| (conf->level == 6 && s.to_write && s.failed)
|
||||
|| (s.to_write && s.failed)
|
||||
|| (s.syncing && (s.uptodate + s.compute < disks))
|
||||
|| s.replacing
|
||||
|| s.expanding)
|
||||
@ -4927,7 +4958,7 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
if ((s.syncing || s.replacing) && s.locked == 0 &&
|
||||
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
|
||||
test_bit(STRIPE_INSYNC, &sh->state)) {
|
||||
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
|
||||
clear_bit(STRIPE_SYNCING, &sh->state);
|
||||
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
@ -4946,14 +4977,11 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
if (!test_bit(R5_ReWrite, &dev->flags)) {
|
||||
set_bit(R5_Wantwrite, &dev->flags);
|
||||
set_bit(R5_ReWrite, &dev->flags);
|
||||
set_bit(R5_LOCKED, &dev->flags);
|
||||
s.locked++;
|
||||
} else {
|
||||
} else
|
||||
/* let's read it back */
|
||||
set_bit(R5_Wantread, &dev->flags);
|
||||
set_bit(R5_LOCKED, &dev->flags);
|
||||
s.locked++;
|
||||
}
|
||||
set_bit(R5_LOCKED, &dev->flags);
|
||||
s.locked++;
|
||||
}
|
||||
}
|
||||
|
||||
@ -4995,7 +5023,7 @@ static void handle_stripe(struct stripe_head *sh)
|
||||
clear_bit(STRIPE_EXPAND_READY, &sh->state);
|
||||
atomic_dec(&conf->reshape_stripes);
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
|
||||
}
|
||||
|
||||
if (s.expanding && s.locked == 0 &&
|
||||
@ -5025,14 +5053,14 @@ finish:
|
||||
/* We own a safe reference to the rdev */
|
||||
rdev = conf->disks[i].rdev;
|
||||
if (!rdev_set_badblocks(rdev, sh->sector,
|
||||
STRIPE_SECTORS, 0))
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
|
||||
rdev = conf->disks[i].rdev;
|
||||
rdev_clear_badblocks(rdev, sh->sector,
|
||||
STRIPE_SECTORS, 0);
|
||||
RAID5_STRIPE_SECTORS(conf), 0);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
|
||||
@ -5041,7 +5069,7 @@ finish:
|
||||
/* rdev have been moved down */
|
||||
rdev = conf->disks[i].rdev;
|
||||
rdev_clear_badblocks(rdev, sh->sector,
|
||||
STRIPE_SECTORS, 0);
|
||||
RAID5_STRIPE_SECTORS(conf), 0);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
}
|
||||
@ -5483,7 +5511,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
/* Skip discard while reshape is happening */
|
||||
return;
|
||||
|
||||
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
|
||||
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
|
||||
last_sector = bio_end_sector(bi);
|
||||
|
||||
bi->bi_next = NULL;
|
||||
@ -5498,7 +5526,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
last_sector *= conf->chunk_sectors;
|
||||
|
||||
for (; logical_sector < last_sector;
|
||||
logical_sector += STRIPE_SECTORS) {
|
||||
logical_sector += RAID5_STRIPE_SECTORS(conf)) {
|
||||
DEFINE_WAIT(w);
|
||||
int d;
|
||||
again:
|
||||
@ -5543,7 +5571,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
|
||||
d++)
|
||||
md_bitmap_startwrite(mddev->bitmap,
|
||||
sh->sector,
|
||||
STRIPE_SECTORS,
|
||||
RAID5_STRIPE_SECTORS(conf),
|
||||
0);
|
||||
sh->bm_seq = conf->seq_flush + 1;
|
||||
set_bit(STRIPE_BIT_DELAY, &sh->state);
|
||||
@ -5608,12 +5636,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
return true;
|
||||
}
|
||||
|
||||
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
|
||||
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
|
||||
last_sector = bio_end_sector(bi);
|
||||
bi->bi_next = NULL;
|
||||
|
||||
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
|
||||
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
|
||||
for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) {
|
||||
int previous;
|
||||
int seq;
|
||||
|
||||
@ -5711,8 +5739,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
do_flush = false;
|
||||
}
|
||||
|
||||
if (!sh->batch_head || sh == sh->batch_head)
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
clear_bit(STRIPE_DELAYED, &sh->state);
|
||||
if ((!sh->batch_head || sh == sh->batch_head) &&
|
||||
(bi->bi_opf & REQ_SYNC) &&
|
||||
@ -5777,7 +5804,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
sector_div(sector_nr, new_data_disks);
|
||||
if (sector_nr) {
|
||||
mddev->curr_resync_completed = sector_nr;
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
*skipped = 1;
|
||||
retn = sector_nr;
|
||||
goto finish;
|
||||
@ -5891,11 +5918,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
conf->reshape_safe = mddev->reshape_position;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&stripes);
|
||||
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
|
||||
for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
|
||||
int j;
|
||||
int skipped_disk = 0;
|
||||
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
|
||||
@ -5916,7 +5943,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
skipped_disk = 1;
|
||||
continue;
|
||||
}
|
||||
memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
|
||||
memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf));
|
||||
set_bit(R5_Expanded, &sh->dev[j].flags);
|
||||
set_bit(R5_UPTODATE, &sh->dev[j].flags);
|
||||
}
|
||||
@ -5951,7 +5978,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
|
||||
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
|
||||
set_bit(STRIPE_HANDLE, &sh->state);
|
||||
raid5_release_stripe(sh);
|
||||
first_sector += STRIPE_SECTORS;
|
||||
first_sector += RAID5_STRIPE_SECTORS(conf);
|
||||
}
|
||||
/* Now that the sources are clearly marked, we can release
|
||||
* the destination stripes
|
||||
@ -5998,7 +6025,7 @@ finish:
|
||||
conf->reshape_safe = mddev->reshape_position;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
wake_up(&conf->wait_for_overlap);
|
||||
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
}
|
||||
ret:
|
||||
return retn;
|
||||
@ -6057,11 +6084,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
|
||||
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
|
||||
!conf->fullsync &&
|
||||
!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
|
||||
sync_blocks >= STRIPE_SECTORS) {
|
||||
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
|
||||
/* we can skip this block, and probably more */
|
||||
sync_blocks /= STRIPE_SECTORS;
|
||||
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
|
||||
*skipped = 1;
|
||||
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
|
||||
/* keep things rounded to whole stripes */
|
||||
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
|
||||
}
|
||||
|
||||
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
|
||||
@ -6094,7 +6122,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
|
||||
|
||||
raid5_release_stripe(sh);
|
||||
|
||||
return STRIPE_SECTORS;
|
||||
return RAID5_STRIPE_SECTORS(conf);
|
||||
}
|
||||
|
||||
static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
|
||||
@ -6117,14 +6145,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
|
||||
int handled = 0;
|
||||
|
||||
logical_sector = raid_bio->bi_iter.bi_sector &
|
||||
~((sector_t)STRIPE_SECTORS-1);
|
||||
~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
|
||||
sector = raid5_compute_sector(conf, logical_sector,
|
||||
0, &dd_idx, NULL);
|
||||
last_sector = bio_end_sector(raid_bio);
|
||||
|
||||
for (; logical_sector < last_sector;
|
||||
logical_sector += STRIPE_SECTORS,
|
||||
sector += STRIPE_SECTORS,
|
||||
logical_sector += RAID5_STRIPE_SECTORS(conf),
|
||||
sector += RAID5_STRIPE_SECTORS(conf),
|
||||
scnt++) {
|
||||
|
||||
if (scnt < offset)
|
||||
@@ -6457,6 +6485,77 @@ raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
raid5_show_rmw_level,
raid5_store_rmw_level);

static ssize_t
raid5_show_stripe_size(struct mddev *mddev, char *page)
{
struct r5conf *conf;
int ret = 0;

spin_lock(&mddev->lock);
conf = mddev->private;
if (conf)
ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf));
spin_unlock(&mddev->lock);
return ret;
}

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
static ssize_t
raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
{
struct r5conf *conf;
unsigned long new;
int err;

if (len >= PAGE_SIZE)
return -EINVAL;
if (kstrtoul(page, 10, &new))
return -EINVAL;

/*
* The value should not be bigger than PAGE_SIZE. It requires to
* be multiple of DEFAULT_STRIPE_SIZE.
*/
if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0)
return -EINVAL;

err = mddev_lock(mddev);
if (err)
return err;

conf = mddev->private;
if (!conf) {
err = -ENODEV;
goto out_unlock;
}

if (new == conf->stripe_size)
goto out_unlock;

pr_debug("md/raid: change stripe_size from %lu to %lu\n",
conf->stripe_size, new);

mddev_suspend(mddev);
conf->stripe_size = new;
conf->stripe_shift = ilog2(new) - 9;
conf->stripe_sectors = new >> 9;
mddev_resume(mddev);

out_unlock:
mddev_unlock(mddev);
return err ?: len;
}

static struct md_sysfs_entry
raid5_stripe_size = __ATTR(stripe_size, 0644,
raid5_show_stripe_size,
raid5_store_stripe_size);
#else
static struct md_sysfs_entry
raid5_stripe_size = __ATTR(stripe_size, 0444,
raid5_show_stripe_size,
NULL);
#endif
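For context, a minimal userspace sketch of tuning the new attribute, assuming the array is md0 and that stripe_size is registered alongside the other raid5 sysfs entries (both assumptions, not part of this patch); the write can only succeed on kernels whose PAGE_SIZE differs from the 4096-byte default:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* hypothetical array name; adjust to the md device being tuned */
        const char *attr = "/sys/block/md0/md/stripe_size";
        const char *val = "16384\n";    /* must be a multiple of 4096 and <= PAGE_SIZE */
        int fd = open(attr, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");        /* EINVAL if the value is rejected */
        close(fd);
        return 0;
}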
|
||||
|
||||
static ssize_t
|
||||
raid5_show_preread_threshold(struct mddev *mddev, char *page)
|
||||
@ -6645,6 +6744,7 @@ static struct attribute *raid5_attrs[] = {
|
||||
&raid5_group_thread_cnt.attr,
|
||||
&raid5_skip_copy.attr,
|
||||
&raid5_rmw_level.attr,
|
||||
&raid5_stripe_size.attr,
|
||||
&r5c_journal_mode.attr,
|
||||
&ppl_write_hint.attr,
|
||||
NULL,
|
||||
@ -6744,7 +6844,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
|
||||
conf->previous_raid_disks),
|
||||
max(conf->chunk_sectors,
|
||||
conf->prev_chunk_sectors)
|
||||
/ STRIPE_SECTORS)) {
|
||||
/ RAID5_STRIPE_SECTORS(conf))) {
|
||||
free_scratch_buffer(conf, percpu);
|
||||
return -ENOMEM;
|
||||
}
|
||||
@ -6896,6 +6996,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
|
||||
if (conf == NULL)
|
||||
goto abort;
|
||||
|
||||
#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
|
||||
conf->stripe_size = DEFAULT_STRIPE_SIZE;
|
||||
conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9;
|
||||
conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9;
|
||||
#endif
|
||||
INIT_LIST_HEAD(&conf->free_list);
|
||||
INIT_LIST_HEAD(&conf->pending_list);
|
||||
conf->pending_data = kcalloc(PENDING_IO_MAX,
|
||||
@ -7047,8 +7153,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
||||
conf->min_nr_stripes = NR_STRIPES;
|
||||
if (mddev->reshape_position != MaxSector) {
|
||||
int stripes = max_t(int,
|
||||
((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
|
||||
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
|
||||
((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4,
|
||||
((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4);
|
||||
conf->min_nr_stripes = max(NR_STRIPES, stripes);
|
||||
if (conf->min_nr_stripes != NR_STRIPES)
|
||||
pr_info("md/raid:%s: force stripe size %d for reshape\n",
|
||||
@ -7779,14 +7885,14 @@ static int check_stripe_cache(struct mddev *mddev)
|
||||
* stripe_heads first.
|
||||
*/
|
||||
struct r5conf *conf = mddev->private;
|
||||
if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
|
||||
if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
|
||||
> conf->min_nr_stripes ||
|
||||
((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
|
||||
((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4
|
||||
> conf->min_nr_stripes) {
|
||||
pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
|
||||
mdname(mddev),
|
||||
((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
|
||||
/ STRIPE_SIZE)*4);
|
||||
/ RAID5_STRIPE_SIZE(conf))*4);
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
@ -7922,8 +8028,8 @@ static int raid5_start_reshape(struct mddev *mddev)
|
||||
else
|
||||
rdev->recovery_offset = 0;
|
||||
|
||||
if (sysfs_link_rdev(mddev, rdev))
|
||||
/* Failure here is OK */;
|
||||
/* Failure here is OK */
|
||||
sysfs_link_rdev(mddev, rdev);
|
||||
}
|
||||
} else if (rdev->raid_disk >= conf->previous_raid_disks
|
||||
&& !test_bit(Faulty, &rdev->flags)) {
|
||||
@ -8118,7 +8224,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
|
||||
while (chunksect && (mddev->array_sectors & (chunksect-1)))
|
||||
chunksect >>= 1;
|
||||
|
||||
if ((chunksect<<9) < STRIPE_SIZE)
|
||||
if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private))
|
||||
/* array size does not allow a suitable chunk size */
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
|
@@ -472,32 +472,20 @@ struct disk_info {
*/

#define NR_STRIPES 256
#define DEFAULT_STRIPE_SIZE 4096

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define STRIPE_SIZE PAGE_SIZE
#define STRIPE_SHIFT (PAGE_SHIFT - 9)
#define STRIPE_SECTORS (STRIPE_SIZE>>9)
#endif

#define IO_THRESHOLD 1
#define BYPASS_THRESHOLD 1
#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK (NR_HASH - 1)
#define MAX_STRIPE_BATCH 8

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
* order without overlap. There may be several bio's per stripe+device, and
* a bio could span several devices.
* When walking this list for a particular stripe+device, we must never proceed
* beyond a bio that extends past this device, as the next bio might no longer
* be valid.
* This function is used to determine the 'next' bio in the list, given the
* sector of the current stripe+device
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
if (bio_end_sector(bio) < sector + STRIPE_SECTORS)
return bio->bi_next;
else
return NULL;
}

/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
* This is because we sometimes take all the spinlocks
* and creating that much locking depth can cause
@@ -574,6 +562,11 @@ struct r5conf {
int raid_disks;
int max_nr_stripes;
int min_nr_stripes;
#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
unsigned long stripe_size;
unsigned int stripe_shift;
unsigned long stripe_sectors;
#endif

/* reshape_progress is the leading edge of a 'reshape'
* It has value MaxSector when no reshape is happening
@@ -690,6 +683,32 @@ struct r5conf {
struct r5pending_data *next_pending_data;
};

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE
#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT
#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS
#else
#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size)
#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift)
#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors)
#endif

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
* order without overlap. There may be several bio's per stripe+device, and
* a bio could span several devices.
* When walking this list for a particular stripe+device, we must never proceed
* beyond a bio that extends past this device, as the next bio might no longer
* be valid.
* This function is used to determine the 'next' bio in the list, given the
* sector of the current stripe+device
*/
static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector)
{
if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
return bio->bi_next;
else
return NULL;
}

/*
* Our supported algorithms
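To illustrate how the new accessors are meant to be used (a sketch only; these helpers are not part of the series and assume the raid5.h context above): callers derive stripe geometry through RAID5_STRIPE_SIZE/SHIFT/SECTORS so the same code compiles down to the old constants on 4K-page kernels and reads the per-array fields elsewhere, mirroring the rounding retry_aligned_read() already does:

/* illustrative helpers, not from this series */
static inline sector_t r5_stripe_start(struct r5conf *conf, sector_t sector)
{
        /* round a sector down to the start of its stripe, as retry_aligned_read() does */
        return sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf) - 1);
}

static inline sector_t r5_sector_to_stripe_nr(struct r5conf *conf, sector_t sector)
{
        /* RAID5_STRIPE_SHIFT() is log2 of the stripe size in 512-byte sectors */
        return sector >> RAID5_STRIPE_SHIFT(conf);
}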
@@ -13,6 +13,7 @@ nvme-core-y := core.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
|
||||
static struct class *nvme_class;
|
||||
static struct class *nvme_subsys_class;
|
||||
|
||||
static int nvme_revalidate_disk(struct gendisk *disk);
|
||||
static int _nvme_revalidate_disk(struct gendisk *disk);
|
||||
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
|
||||
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
|
||||
unsigned nsid);
|
||||
@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
|
||||
* Revalidating a dead namespace sets capacity to 0. This will end
|
||||
* buffered writers dirtying pages that can't be synced.
|
||||
*/
|
||||
if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
|
||||
if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
|
||||
return;
|
||||
blk_set_queue_dying(ns->queue);
|
||||
/* Forcibly unquiesce queues to avoid blocking dispatch */
|
||||
@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
|
||||
nvme_retry_req(req);
|
||||
return;
|
||||
}
|
||||
} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
|
||||
req_op(req) == REQ_OP_ZONE_APPEND) {
|
||||
req->__sector = nvme_lba_to_sect(req->q->queuedata,
|
||||
le64_to_cpu(nvme_req(req)->result.u64));
|
||||
}
|
||||
|
||||
nvme_trace_bio_complete(req, status);
|
||||
@ -362,6 +366,16 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case NVME_CTRL_DELETING_NOIO:
|
||||
switch (old_state) {
|
||||
case NVME_CTRL_DELETING:
|
||||
case NVME_CTRL_DEAD:
|
||||
changed = true;
|
||||
/* FALLTHRU */
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case NVME_CTRL_DEAD:
|
||||
switch (old_state) {
|
||||
case NVME_CTRL_DELETING:
|
||||
@ -399,6 +413,7 @@ static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
|
||||
case NVME_CTRL_CONNECTING:
|
||||
return false;
|
||||
case NVME_CTRL_DELETING:
|
||||
case NVME_CTRL_DELETING_NOIO:
|
||||
case NVME_CTRL_DEAD:
|
||||
return true;
|
||||
default:
|
||||
@ -450,10 +465,11 @@ static void nvme_free_ns(struct kref *kref)
|
||||
kfree(ns);
|
||||
}
|
||||
|
||||
static void nvme_put_ns(struct nvme_ns *ns)
|
||||
void nvme_put_ns(struct nvme_ns *ns)
|
||||
{
|
||||
kref_put(&ns->kref, nvme_free_ns);
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
|
||||
|
||||
static inline void nvme_clear_nvme_request(struct request *req)
|
||||
{
|
||||
@ -555,7 +571,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
|
||||
goto out_disable_stream;
|
||||
}
|
||||
|
||||
ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
|
||||
ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
|
||||
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
|
||||
return 0;
|
||||
|
||||
@ -589,6 +605,14 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
|
||||
req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
|
||||
}
|
||||
|
||||
static void nvme_setup_passthrough(struct request *req,
|
||||
struct nvme_command *cmd)
|
||||
{
|
||||
memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
|
||||
/* passthru commands should let the driver set the SGL flags */
|
||||
cmd->common.flags &= ~NVME_CMD_SGL_ALL;
|
||||
}
|
||||
|
||||
static inline void nvme_setup_flush(struct nvme_ns *ns,
|
||||
struct nvme_command *cmnd)
|
||||
{
|
||||
@ -673,7 +697,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
|
||||
}
|
||||
|
||||
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
struct request *req, struct nvme_command *cmnd)
|
||||
struct request *req, struct nvme_command *cmnd,
|
||||
enum nvme_opcode op)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
u16 control = 0;
|
||||
@ -687,7 +712,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
if (req->cmd_flags & REQ_RAHEAD)
|
||||
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
|
||||
|
||||
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
|
||||
cmnd->rw.opcode = op;
|
||||
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
|
||||
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
|
||||
@ -716,6 +741,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
case NVME_NS_DPS_PI_TYPE2:
|
||||
control |= NVME_RW_PRINFO_PRCHK_GUARD |
|
||||
NVME_RW_PRINFO_PRCHK_REF;
|
||||
if (op == nvme_cmd_zone_append)
|
||||
control |= NVME_RW_APPEND_PIREMAP;
|
||||
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
|
||||
break;
|
||||
}
|
||||
@ -751,11 +778,24 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
|
||||
switch (req_op(req)) {
|
||||
case REQ_OP_DRV_IN:
|
||||
case REQ_OP_DRV_OUT:
|
||||
memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
|
||||
nvme_setup_passthrough(req, cmd);
|
||||
break;
|
||||
case REQ_OP_FLUSH:
|
||||
nvme_setup_flush(ns, cmd);
|
||||
break;
|
||||
case REQ_OP_ZONE_RESET_ALL:
|
||||
case REQ_OP_ZONE_RESET:
|
||||
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
|
||||
break;
|
||||
case REQ_OP_ZONE_OPEN:
|
||||
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
|
||||
break;
|
||||
case REQ_OP_ZONE_CLOSE:
|
||||
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
|
||||
break;
|
||||
case REQ_OP_ZONE_FINISH:
|
||||
ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
|
||||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
ret = nvme_setup_write_zeroes(ns, req, cmd);
|
||||
break;
|
||||
@ -763,8 +803,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
|
||||
ret = nvme_setup_discard(ns, req, cmd);
|
||||
break;
|
||||
case REQ_OP_READ:
|
||||
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
|
||||
break;
|
||||
case REQ_OP_WRITE:
|
||||
ret = nvme_setup_rw(ns, req, cmd);
|
||||
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
|
||||
break;
|
||||
case REQ_OP_ZONE_APPEND:
|
||||
ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
|
||||
break;
|
||||
default:
|
||||
WARN_ON_ONCE(1);
|
||||
@ -884,6 +929,120 @@ out:
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static u32 nvme_known_admin_effects(u8 opcode)
|
||||
{
|
||||
switch (opcode) {
|
||||
case nvme_admin_format_nvm:
|
||||
return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
|
||||
NVME_CMD_EFFECTS_CSE_MASK;
|
||||
case nvme_admin_sanitize_nvm:
|
||||
return NVME_CMD_EFFECTS_CSE_MASK;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
|
||||
{
|
||||
u32 effects = 0;
|
||||
|
||||
if (ns) {
|
||||
if (ns->head->effects)
|
||||
effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
|
||||
if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
|
||||
dev_warn(ctrl->device,
|
||||
"IO command:%02x has unhandled effects:%08x\n",
|
||||
opcode, effects);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ctrl->effects)
|
||||
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
|
||||
effects |= nvme_known_admin_effects(opcode);
|
||||
|
||||
return effects;
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
|
||||
|
||||
static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
u8 opcode)
|
||||
{
|
||||
u32 effects = nvme_command_effects(ctrl, ns, opcode);
|
||||
|
||||
/*
|
||||
* For simplicity, IO to all namespaces is quiesced even if the command
|
||||
* effects say only one namespace is affected.
|
||||
*/
|
||||
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
|
||||
mutex_lock(&ctrl->scan_lock);
|
||||
mutex_lock(&ctrl->subsys->lock);
|
||||
nvme_mpath_start_freeze(ctrl->subsys);
|
||||
nvme_mpath_wait_freeze(ctrl->subsys);
|
||||
nvme_start_freeze(ctrl);
|
||||
nvme_wait_freeze(ctrl);
|
||||
}
|
||||
return effects;
|
||||
}
|
||||
|
||||
static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
|
||||
{
|
||||
struct nvme_ns *ns;
|
||||
|
||||
down_read(&ctrl->namespaces_rwsem);
|
||||
list_for_each_entry(ns, &ctrl->namespaces, list)
|
||||
if (_nvme_revalidate_disk(ns->disk))
|
||||
nvme_set_queue_dying(ns);
|
||||
else if (blk_queue_is_zoned(ns->disk->queue)) {
|
||||
/*
|
||||
* IO commands are required to fully revalidate a zoned
|
||||
* device. Force the command effects to trigger rescan
|
||||
* work so report zones can run in a context with
|
||||
* unfrozen IO queues.
|
||||
*/
|
||||
*effects |= NVME_CMD_EFFECTS_NCC;
|
||||
}
|
||||
up_read(&ctrl->namespaces_rwsem);
|
||||
}
|
||||
|
||||
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
|
||||
{
|
||||
/*
|
||||
* Revalidate LBA changes prior to unfreezing. This is necessary to
|
||||
* prevent memory corruption if a logical block size was changed by
|
||||
* this command.
|
||||
*/
|
||||
if (effects & NVME_CMD_EFFECTS_LBCC)
|
||||
nvme_update_formats(ctrl, &effects);
|
||||
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
|
||||
nvme_unfreeze(ctrl);
|
||||
nvme_mpath_unfreeze(ctrl->subsys);
|
||||
mutex_unlock(&ctrl->subsys->lock);
|
||||
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
|
||||
mutex_unlock(&ctrl->scan_lock);
|
||||
}
|
||||
if (effects & NVME_CMD_EFFECTS_CCC)
|
||||
nvme_init_identify(ctrl);
|
||||
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
|
||||
nvme_queue_scan(ctrl);
|
||||
flush_work(&ctrl->scan_work);
|
||||
}
|
||||
}
|
||||
|
||||
void nvme_execute_passthru_rq(struct request *rq)
|
||||
{
|
||||
struct nvme_command *cmd = nvme_req(rq)->cmd;
|
||||
struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
|
||||
struct nvme_ns *ns = rq->q->queuedata;
|
||||
struct gendisk *disk = ns ? ns->disk : NULL;
|
||||
u32 effects;
|
||||
|
||||
effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
|
||||
blk_execute_rq(rq->q, disk, rq, 0);
|
||||
nvme_passthru_end(ctrl, effects);
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
|
||||
|
||||
static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
struct nvme_command *cmd, void __user *ubuffer,
|
||||
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
|
||||
@ -922,7 +1081,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
}
|
||||
}
|
||||
|
||||
blk_execute_rq(req->q, disk, req, 0);
|
||||
nvme_execute_passthru_rq(req);
|
||||
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
|
||||
ret = -EINTR;
|
||||
else
|
||||
@ -1056,8 +1215,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
|
||||
return error;
|
||||
}
|
||||
|
||||
static bool nvme_multi_css(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
|
||||
}
|
||||
|
||||
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
|
||||
struct nvme_ns_id_desc *cur)
|
||||
struct nvme_ns_id_desc *cur, bool *csi_seen)
|
||||
{
|
||||
const char *warn_str = "ctrl returned bogus length:";
|
||||
void *data = cur;
|
||||
@ -1087,6 +1251,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
|
||||
}
|
||||
uuid_copy(&ids->uuid, data + sizeof(*cur));
|
||||
return NVME_NIDT_UUID_LEN;
|
||||
case NVME_NIDT_CSI:
|
||||
if (cur->nidl != NVME_NIDT_CSI_LEN) {
|
||||
dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
|
||||
warn_str, cur->nidl);
|
||||
return -1;
|
||||
}
|
||||
memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
|
||||
*csi_seen = true;
|
||||
return NVME_NIDT_CSI_LEN;
|
||||
default:
|
||||
/* Skip unknown types */
|
||||
return cur->nidl;
|
||||
@ -1097,10 +1270,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
|
||||
struct nvme_ns_ids *ids)
|
||||
{
|
||||
struct nvme_command c = { };
|
||||
int status;
|
||||
bool csi_seen = false;
|
||||
int status, pos, len;
|
||||
void *data;
|
||||
int pos;
|
||||
int len;
|
||||
|
||||
if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
|
||||
return 0;
|
||||
@ -1127,12 +1299,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
|
||||
if (cur->nidl == 0)
|
||||
break;
|
||||
|
||||
len = nvme_process_ns_desc(ctrl, ids, cur);
|
||||
len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
|
||||
if (len < 0)
|
||||
goto free_data;
|
||||
break;
|
||||
|
||||
len += sizeof(*cur);
|
||||
}
|
||||
|
||||
if (nvme_multi_css(ctrl) && !csi_seen) {
|
||||
dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
|
||||
nsid);
|
||||
status = -EINVAL;
|
||||
}
|
||||
|
||||
free_data:
|
||||
kfree(data);
|
||||
return status;
|
||||
@ -1321,96 +1500,12 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
|
||||
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
|
||||
}
|
||||
|
||||
static u32 nvme_known_admin_effects(u8 opcode)
|
||||
{
|
||||
switch (opcode) {
|
||||
case nvme_admin_format_nvm:
|
||||
return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
|
||||
NVME_CMD_EFFECTS_CSE_MASK;
|
||||
case nvme_admin_sanitize_nvm:
|
||||
return NVME_CMD_EFFECTS_CSE_MASK;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
u8 opcode)
|
||||
{
|
||||
u32 effects = 0;
|
||||
|
||||
if (ns) {
|
||||
if (ctrl->effects)
|
||||
effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
|
||||
if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
|
||||
dev_warn(ctrl->device,
|
||||
"IO command:%02x has unhandled effects:%08x\n",
|
||||
opcode, effects);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (ctrl->effects)
|
||||
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
|
||||
effects |= nvme_known_admin_effects(opcode);
|
||||
|
||||
/*
|
||||
* For simplicity, IO to all namespaces is quiesced even if the command
|
||||
* effects say only one namespace is affected.
|
||||
*/
|
||||
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
|
||||
mutex_lock(&ctrl->scan_lock);
|
||||
mutex_lock(&ctrl->subsys->lock);
|
||||
nvme_mpath_start_freeze(ctrl->subsys);
|
||||
nvme_mpath_wait_freeze(ctrl->subsys);
|
||||
nvme_start_freeze(ctrl);
|
||||
nvme_wait_freeze(ctrl);
|
||||
}
|
||||
return effects;
|
||||
}
|
||||
|
||||
static void nvme_update_formats(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
struct nvme_ns *ns;
|
||||
|
||||
down_read(&ctrl->namespaces_rwsem);
|
||||
list_for_each_entry(ns, &ctrl->namespaces, list)
|
||||
if (ns->disk && nvme_revalidate_disk(ns->disk))
|
||||
nvme_set_queue_dying(ns);
|
||||
up_read(&ctrl->namespaces_rwsem);
|
||||
}
|
||||
|
||||
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
|
||||
{
|
||||
/*
|
||||
* Revalidate LBA changes prior to unfreezing. This is necessary to
|
||||
* prevent memory corruption if a logical block size was changed by
|
||||
* this command.
|
||||
*/
|
||||
if (effects & NVME_CMD_EFFECTS_LBCC)
|
||||
nvme_update_formats(ctrl);
|
||||
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
|
||||
nvme_unfreeze(ctrl);
|
||||
nvme_mpath_unfreeze(ctrl->subsys);
|
||||
mutex_unlock(&ctrl->subsys->lock);
|
||||
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
|
||||
mutex_unlock(&ctrl->scan_lock);
|
||||
}
|
||||
if (effects & NVME_CMD_EFFECTS_CCC)
|
||||
nvme_init_identify(ctrl);
|
||||
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
|
||||
nvme_queue_scan(ctrl);
|
||||
flush_work(&ctrl->scan_work);
|
||||
}
|
||||
}
|
||||
|
||||
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
struct nvme_passthru_cmd __user *ucmd)
|
||||
{
|
||||
struct nvme_passthru_cmd cmd;
|
||||
struct nvme_command c;
|
||||
unsigned timeout = 0;
|
||||
u32 effects;
|
||||
u64 result;
|
||||
int status;
|
||||
|
||||
@ -1437,12 +1532,10 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
if (cmd.timeout_ms)
|
||||
timeout = msecs_to_jiffies(cmd.timeout_ms);
|
||||
|
||||
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
|
||||
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
|
||||
nvme_to_user_ptr(cmd.addr), cmd.data_len,
|
||||
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
|
||||
0, &result, timeout);
|
||||
nvme_passthru_end(ctrl, effects);
|
||||
|
||||
if (status >= 0) {
|
||||
if (put_user(result, &ucmd->result))
|
||||
@ -1458,7 +1551,6 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
struct nvme_passthru_cmd64 cmd;
|
||||
struct nvme_command c;
|
||||
unsigned timeout = 0;
|
||||
u32 effects;
|
||||
int status;
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
@ -1484,12 +1576,10 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
if (cmd.timeout_ms)
|
||||
timeout = msecs_to_jiffies(cmd.timeout_ms);
|
||||
|
||||
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
|
||||
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
|
||||
nvme_to_user_ptr(cmd.addr), cmd.data_len,
|
||||
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
|
||||
0, &cmd.result, timeout);
|
||||
nvme_passthru_end(ctrl, effects);
|
||||
|
||||
if (status >= 0) {
|
||||
if (put_user(cmd.result, &ucmd->result))
|
||||
@ -1503,7 +1593,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
* Issue ioctl requests on the first available path. Note that unlike normal
|
||||
* block layer requests we will not retry failed request on another controller.
|
||||
*/
|
||||
static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
|
||||
struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
|
||||
struct nvme_ns_head **head, int *srcu_idx)
|
||||
{
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
@ -1523,7 +1613,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
|
||||
return disk->private_data;
|
||||
}
|
||||
|
||||
static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
|
||||
void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
|
||||
{
|
||||
if (head)
|
||||
srcu_read_unlock(&head->srcu, idx);
|
||||
@ -1789,7 +1879,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
|
||||
memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
|
||||
if (ctrl->vs >= NVME_VS(1, 2, 0))
|
||||
memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
|
||||
if (ctrl->vs >= NVME_VS(1, 3, 0))
|
||||
if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
|
||||
return nvme_identify_ns_descs(ctrl, nsid, ids);
|
||||
return 0;
|
||||
}
|
||||
@ -1805,7 +1895,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
|
||||
{
|
||||
return uuid_equal(&a->uuid, &b->uuid) &&
|
||||
memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
|
||||
memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
|
||||
memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
|
||||
a->csi == b->csi;
|
||||
}
|
||||
|
||||
static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
@ -1915,18 +2006,38 @@ static void nvme_update_disk_info(struct gendisk *disk,
|
||||
|
||||
static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
|
||||
{
|
||||
unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
|
||||
struct nvme_ns *ns = disk->private_data;
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
int ret;
|
||||
u32 iob;
|
||||
|
||||
/*
|
||||
* If identify namespace failed, use default 512 byte block size so
|
||||
* block layer can use before failing read/write for 0 capacity.
|
||||
*/
|
||||
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
|
||||
ns->lba_shift = id->lbaf[lbaf].ds;
|
||||
if (ns->lba_shift == 0)
|
||||
ns->lba_shift = 9;
|
||||
|
||||
switch (ns->head->ids.csi) {
|
||||
case NVME_CSI_NVM:
|
||||
break;
|
||||
case NVME_CSI_ZNS:
|
||||
ret = nvme_update_zone_info(disk, ns, lbaf);
|
||||
if (ret) {
|
||||
dev_warn(ctrl->device,
|
||||
"failed to add zoned namespace:%u ret:%d\n",
|
||||
ns->head->ns_id, ret);
|
||||
return ret;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
|
||||
ns->head->ids.csi, ns->head->ns_id);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
|
||||
is_power_of_2(ctrl->max_hw_sectors))
|
||||
iob = ctrl->max_hw_sectors;
|
||||
@ -1934,7 +2045,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
|
||||
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
|
||||
|
||||
ns->features = 0;
|
||||
ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
|
||||
ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
|
||||
/* the PI implementation requires metadata equal t10 pi tuple size */
|
||||
if (ns->ms == sizeof(struct t10_pi_tuple))
|
||||
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
|
||||
@ -1977,7 +2088,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_revalidate_disk(struct gendisk *disk)
|
||||
static int _nvme_revalidate_disk(struct gendisk *disk)
|
||||
{
|
||||
struct nvme_ns *ns = disk->private_data;
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
@ -2025,6 +2136,28 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_revalidate_disk(struct gendisk *disk)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = _nvme_revalidate_disk(disk);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
if (blk_queue_is_zoned(disk->queue)) {
|
||||
struct nvme_ns *ns = disk->private_data;
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
|
||||
ret = blk_revalidate_disk_zones(disk, NULL);
|
||||
if (!ret)
|
||||
blk_queue_max_zone_append_sectors(disk->queue,
|
||||
ctrl->max_zone_append);
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static char nvme_pr_type(enum pr_type type)
|
||||
{
|
||||
switch (type) {
|
||||
@ -2155,6 +2288,7 @@ static const struct block_device_operations nvme_fops = {
|
||||
.release = nvme_release,
|
||||
.getgeo = nvme_getgeo,
|
||||
.revalidate_disk= nvme_revalidate_disk,
|
||||
.report_zones = nvme_report_zones,
|
||||
.pr_ops = &nvme_pr_ops,
|
||||
};
|
||||
|
||||
@ -2181,6 +2315,7 @@ const struct block_device_operations nvme_ns_head_ops = {
|
||||
.ioctl = nvme_ioctl,
|
||||
.compat_ioctl = nvme_compat_ioctl,
|
||||
.getgeo = nvme_getgeo,
|
||||
.report_zones = nvme_report_zones,
|
||||
.pr_ops = &nvme_pr_ops,
|
||||
};
|
||||
#endif /* CONFIG_NVME_MULTIPATH */
|
||||
@ -2238,12 +2373,7 @@ EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
|
||||
|
||||
int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
/*
|
||||
* Default to a 4K page size, with the intention to update this
|
||||
* path in the future to accomodate architectures with differing
|
||||
* kernel and IO page sizes.
|
||||
*/
|
||||
unsigned dev_page_min, page_shift = 12;
|
||||
unsigned dev_page_min;
|
||||
int ret;
|
||||
|
||||
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
|
||||
@ -2253,17 +2383,18 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
|
||||
}
|
||||
dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
|
||||
|
||||
if (page_shift < dev_page_min) {
|
||||
if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
|
||||
dev_err(ctrl->device,
|
||||
"Minimum device page size %u too large for host (%u)\n",
|
||||
1 << dev_page_min, 1 << page_shift);
|
||||
1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
ctrl->page_size = 1 << page_shift;
|
||||
|
||||
ctrl->ctrl_config = NVME_CC_CSS_NVM;
|
||||
ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
|
||||
if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
|
||||
ctrl->ctrl_config = NVME_CC_CSS_CSI;
|
||||
else
|
||||
ctrl->ctrl_config = NVME_CC_CSS_NVM;
|
||||
ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
|
||||
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
|
||||
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
|
||||
ctrl->ctrl_config |= NVME_CC_ENABLE;
|
||||
@ -2313,13 +2444,13 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
|
||||
|
||||
if (ctrl->max_hw_sectors) {
|
||||
u32 max_segments =
|
||||
(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
|
||||
(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
|
||||
|
||||
max_segments = min_not_zero(max_segments, ctrl->max_segments);
|
||||
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
|
||||
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
|
||||
}
|
||||
blk_queue_virt_boundary(q, ctrl->page_size - 1);
|
||||
blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
|
||||
blk_queue_dma_alignment(q, 7);
|
||||
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
|
||||
vwc = true;
|
||||
@ -2810,7 +2941,7 @@ out_unlock:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
|
||||
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
|
||||
void *log, size_t size, u64 offset)
|
||||
{
|
||||
struct nvme_command c = { };
|
||||
@ -2824,27 +2955,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
|
||||
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
|
||||
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
|
||||
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
|
||||
c.get_log_page.csi = csi;
|
||||
|
||||
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
|
||||
}
|
||||
|
||||
static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
|
||||
static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
|
||||
{
|
||||
struct nvme_cel *cel, *ret = NULL;
|
||||
|
||||
spin_lock(&ctrl->lock);
|
||||
list_for_each_entry(cel, &ctrl->cels, entry) {
|
||||
if (cel->csi == csi) {
|
||||
ret = cel;
|
||||
break;
|
||||
}
|
||||
}
|
||||
spin_unlock(&ctrl->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
|
||||
struct nvme_effects_log **log)
|
||||
{
|
||||
struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
|
||||
int ret;
|
||||
|
||||
if (!ctrl->effects)
|
||||
ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
|
||||
if (cel)
|
||||
goto out;
|
||||
|
||||
if (!ctrl->effects)
|
||||
return 0;
|
||||
cel = kzalloc(sizeof(*cel), GFP_KERNEL);
|
||||
if (!cel)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
|
||||
ctrl->effects, sizeof(*ctrl->effects), 0);
|
||||
ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
|
||||
&cel->log, sizeof(cel->log), 0);
|
||||
if (ret) {
|
||||
kfree(ctrl->effects);
|
||||
ctrl->effects = NULL;
|
||||
kfree(cel);
|
||||
return ret;
|
||||
}
|
||||
return ret;
|
||||
|
||||
cel->csi = csi;
|
||||
|
||||
spin_lock(&ctrl->lock);
|
||||
list_add_tail(&cel->entry, &ctrl->cels);
|
||||
spin_unlock(&ctrl->lock);
|
||||
out:
|
||||
*log = &cel->log;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2865,7 +3024,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
||||
return ret;
|
||||
}
|
||||
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
|
||||
ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
|
||||
ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
|
||||
|
||||
if (ctrl->vs >= NVME_VS(1, 1, 0))
|
||||
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
|
||||
@ -2877,7 +3036,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
||||
}
|
||||
|
||||
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
|
||||
ret = nvme_get_effects_log(ctrl);
|
||||
ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
|
||||
if (ret < 0)
|
||||
goto out_free;
|
||||
}
|
||||
@ -2939,7 +3098,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
||||
|
||||
if (id->rtd3e) {
|
||||
/* us -> s */
|
||||
u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
|
||||
u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
|
||||
|
||||
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
|
||||
shutdown_timeout, 60);
|
||||
@ -3345,6 +3504,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
|
||||
[NVME_CTRL_RESETTING] = "resetting",
|
||||
[NVME_CTRL_CONNECTING] = "connecting",
|
||||
[NVME_CTRL_DELETING] = "deleting",
|
||||
[NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
|
||||
[NVME_CTRL_DEAD] = "dead",
|
||||
};
|
||||
|
||||
@ -3397,6 +3557,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
|
||||
}
|
||||
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
|
||||
|
||||
static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
||||
struct nvmf_ctrl_options *opts = ctrl->opts;
|
||||
|
||||
if (ctrl->opts->max_reconnects == -1)
|
||||
return sprintf(buf, "off\n");
|
||||
return sprintf(buf, "%d\n",
|
||||
opts->max_reconnects * opts->reconnect_delay);
|
||||
}
|
||||
|
||||
static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t count)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
||||
struct nvmf_ctrl_options *opts = ctrl->opts;
|
||||
int ctrl_loss_tmo, err;
|
||||
|
||||
err = kstrtoint(buf, 10, &ctrl_loss_tmo);
|
||||
if (err)
|
||||
return -EINVAL;
|
||||
|
||||
else if (ctrl_loss_tmo < 0)
|
||||
opts->max_reconnects = -1;
|
||||
else
|
||||
opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
|
||||
opts->reconnect_delay);
|
||||
return count;
|
||||
}
|
||||
static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
|
||||
nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
|
||||
|
||||
static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
||||
|
||||
if (ctrl->opts->reconnect_delay == -1)
|
||||
return sprintf(buf, "off\n");
|
||||
return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
|
||||
}
|
||||
|
||||
static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
|
||||
struct device_attribute *attr, const char *buf, size_t count)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
|
||||
unsigned int v;
|
||||
int err;
|
||||
|
||||
err = kstrtou32(buf, 10, &v);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ctrl->opts->reconnect_delay = v;
|
||||
return count;
|
||||
}
|
||||
static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
|
||||
nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
|
||||
|
||||
static struct attribute *nvme_dev_attrs[] = {
|
||||
&dev_attr_reset_controller.attr,
|
||||
&dev_attr_rescan_controller.attr,
|
||||
@ -3414,6 +3634,8 @@ static struct attribute *nvme_dev_attrs[] = {
|
||||
&dev_attr_sqsize.attr,
|
||||
&dev_attr_hostnqn.attr,
|
||||
&dev_attr_hostid.attr,
|
||||
&dev_attr_ctrl_loss_tmo.attr,
|
||||
&dev_attr_reconnect_delay.attr,
|
||||
NULL
|
||||
};
|
||||
|
||||
@ -3510,6 +3732,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
|
||||
goto out_cleanup_srcu;
|
||||
}
|
||||
|
||||
if (head->ids.csi) {
|
||||
ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
|
||||
if (ret)
|
||||
goto out_cleanup_srcu;
|
||||
} else
|
||||
head->effects = ctrl->effects;
|
||||
|
||||
ret = nvme_mpath_alloc_disk(ctrl, head);
|
||||
if (ret)
|
||||
goto out_cleanup_srcu;
|
||||
@ -3591,7 +3820,7 @@ static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
|
||||
return nsa->head->ns_id - nsb->head->ns_id;
|
||||
}
|
||||
|
||||
static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
{
|
||||
struct nvme_ns *ns, *ret = NULL;
|
||||
|
||||
@ -3609,6 +3838,7 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
up_read(&ctrl->namespaces_rwsem);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
|
||||
|
||||
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
{
|
||||
@ -3726,7 +3956,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
|
||||
nvme_mpath_clear_current_path(ns);
|
||||
synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
|
||||
|
||||
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
|
||||
if (ns->disk->flags & GENHD_FL_UP) {
|
||||
del_gendisk(ns->disk);
|
||||
blk_cleanup_queue(ns->queue);
|
||||
if (blk_get_integrity(ns->disk))
|
||||
@ -3757,7 +3987,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
|
||||
|
||||
ns = nvme_find_get_ns(ctrl, nsid);
|
||||
if (ns) {
|
||||
if (ns->disk && revalidate_disk(ns->disk))
|
||||
if (revalidate_disk(ns->disk))
|
||||
nvme_ns_remove(ns);
|
||||
nvme_put_ns(ns);
|
||||
} else
|
||||
@ -3850,8 +4080,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
|
||||
* raced with us in reading the log page, which could cause us to miss
|
||||
* updates.
|
||||
*/
|
||||
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
|
||||
log_size, 0);
|
||||
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
|
||||
NVME_CSI_NVM, log, log_size, 0);
|
||||
if (error)
|
||||
dev_warn(ctrl->device,
|
||||
"reading changed ns log failed: %d\n", error);
|
||||
@ -3912,6 +4142,9 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
|
||||
if (ctrl->state == NVME_CTRL_DEAD)
|
||||
nvme_kill_queues(ctrl);
|
||||
|
||||
/* this is a no-op when called from the controller reset handler */
|
||||
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
|
||||
|
||||
down_write(&ctrl->namespaces_rwsem);
|
||||
list_splice_init(&ctrl->namespaces, &ns_list);
|
||||
up_write(&ctrl->namespaces_rwsem);
|
||||
@ -3995,8 +4228,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
|
||||
if (!log)
|
||||
return;
|
||||
|
||||
if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
|
||||
sizeof(*log), 0))
|
||||
if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
|
||||
log, sizeof(*log), 0))
|
||||
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
|
||||
kfree(log);
|
||||
}
|
||||
@ -4106,8 +4339,7 @@ EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
|
||||
|
||||
void nvme_start_ctrl(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
if (ctrl->kato)
|
||||
nvme_start_keep_alive(ctrl);
|
||||
nvme_start_keep_alive(ctrl);
|
||||
|
||||
nvme_enable_aen(ctrl);
|
||||
|
||||
@ -4133,11 +4365,16 @@ static void nvme_free_ctrl(struct device *dev)
|
||||
struct nvme_ctrl *ctrl =
|
||||
container_of(dev, struct nvme_ctrl, ctrl_device);
|
||||
struct nvme_subsystem *subsys = ctrl->subsys;
|
||||
struct nvme_cel *cel, *next;
|
||||
|
||||
if (subsys && ctrl->instance != subsys->instance)
|
||||
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
|
||||
|
||||
kfree(ctrl->effects);
|
||||
list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
|
||||
list_del(&cel->entry);
|
||||
kfree(cel);
|
||||
}
|
||||
|
||||
nvme_mpath_uninit(ctrl);
|
||||
__free_page(ctrl->discard_page);
|
||||
|
||||
@ -4168,6 +4405,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
|
||||
spin_lock_init(&ctrl->lock);
|
||||
mutex_init(&ctrl->scan_lock);
|
||||
INIT_LIST_HEAD(&ctrl->namespaces);
|
||||
INIT_LIST_HEAD(&ctrl->cels);
|
||||
init_rwsem(&ctrl->namespaces_rwsem);
|
||||
ctrl->dev = dev;
|
||||
ctrl->ops = ops;
|
||||
@@ -4346,6 +4584,29 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_sync_queues);

struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path)
{
struct nvme_ctrl *ctrl;
struct file *f;

f = filp_open(path, O_RDWR, 0);
if (IS_ERR(f))
return ERR_CAST(f);

if (f->f_op != &nvme_dev_fops) {
ctrl = ERR_PTR(-EINVAL);
goto out_close;
}

ctrl = f->private_data;
nvme_get_ctrl(ctrl);

out_close:
filp_close(f, NULL);
return ctrl;
}
EXPORT_SYMBOL_NS_GPL(nvme_ctrl_get_by_path, NVME_TARGET_PASSTHRU);

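A sketch of how a caller on the target side might use this newly exported helper to pin a controller by its character device path and release it when done (the function name and surrounding logic are illustrative assumptions, not code from this series; nvme_put_ctrl() is the existing reference-drop helper):

/* illustrative caller, not part of this series; assumes drivers/nvme/host/nvme.h */
#include <linux/err.h>

static int demo_bind_passthru_ctrl(const char *dev_path)
{
        /* dev_path names the controller character device, e.g. "/dev/nvme0" */
        struct nvme_ctrl *ctrl = nvme_ctrl_get_by_path(dev_path);

        if (IS_ERR(ctrl))
                return PTR_ERR(ctrl);

        /* ... wire the controller into the target subsystem here ... */

        nvme_put_ctrl(ctrl);    /* drop the reference taken by the lookup */
        return 0;
}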
/*
|
||||
* Check we didn't inadvertently grow the command structure sizes:
|
||||
*/
|
||||
@ -4364,6 +4625,8 @@ static inline void _nvme_check_size(void)
|
||||
BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
|
||||
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
|
||||
|
@@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
struct request *rq)
{
if (ctrl->state != NVME_CTRL_DELETING &&
if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
ctrl->state != NVME_CTRL_DEAD &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;

@@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
if (likely(ctrl->state == NVME_CTRL_LIVE))
if (likely(ctrl->state == NVME_CTRL_LIVE ||
ctrl->state == NVME_CTRL_DELETING))
return true;
return __nvmf_check_ready(ctrl, rq, queue_live);
}
@ -826,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
|
||||
break;
|
||||
|
||||
case NVME_CTRL_DELETING:
|
||||
case NVME_CTRL_DELETING_NOIO:
|
||||
default:
|
||||
/* no action to take - let it delete */
|
||||
break;
|
||||
@ -3001,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
|
||||
if (ret)
|
||||
goto out_disconnect_admin_queue;
|
||||
|
||||
ctrl->ctrl.max_hw_sectors =
|
||||
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
|
||||
ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments;
|
||||
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
|
||||
(ilog2(SZ_4K) - 9);
|
||||
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
|
||||
|
@@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data)
int ret;

ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0,
&data->log, sizeof(data->log), 0);
NVME_CSI_NVM, &data->log, sizeof(data->log), 0);

return ret <= 0 ? ret : -EIO;
}
@@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl)

err = nvme_hwmon_get_smart_log(data);
if (err) {
dev_warn(dev, "Failed to read smart log (error %d)\n", err);
dev_warn(ctrl->device,
"Failed to read smart log (error %d)\n", err);
devm_kfree(dev, data);
return;
}
@@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
dev_meta_off = dev_meta;

ret = nvme_get_log(ctrl, ns->head->ns_id,
NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
offset);
NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM,
dev_meta, len, offset);
if (ret) {
dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
break;
@@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
return ns->ctrl->state != NVME_CTRL_LIVE ||
test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
test_bit(NVME_NS_REMOVING, &ns->flags);
/*
* We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
* still be able to complete assuming that the controller is connected.
* Otherwise it will fail immediately and return to the requeue list.
*/
if (ns->ctrl->state != NVME_CTRL_LIVE &&
ns->ctrl->state != NVME_CTRL_DELETING)
return true;
if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
test_bit(NVME_NS_REMOVING, &ns->flags))
return true;
return false;
}

static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
@@ -246,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
fallback = ns;
}

/* No optimized path found, re-check the current path */
if (!nvme_path_is_disabled(old) &&
old->ana_state == NVME_ANA_OPTIMIZED) {
found = old;
goto out;
}
if (!fallback)
return NULL;
found = fallback;
@@ -266,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
struct nvme_ns *ns;

ns = srcu_dereference(head->current_path[node], &head->srcu);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
ns = nvme_round_robin_path(head, node, ns);
if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head, node);
if (unlikely(!ns))
return __nvme_find_path(head, node);

if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
}

@@ -527,7 +545,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
int error;

mutex_lock(&ctrl->ana_lock);
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
ctrl->ana_log_buf, ctrl->ana_log_size, 0);
if (error) {
dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
@@ -563,6 +581,9 @@ static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

if (ctrl->state != NVME_CTRL_LIVE)
return;

nvme_read_ana_log(ctrl);
}

@ -37,6 +37,14 @@ extern unsigned int admin_timeout;
|
||||
#define NVME_INLINE_METADATA_SG_CNT 1
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Default to a 4K page size, with the intention to update this
|
||||
* path in the future to accommodate architectures with differing
|
||||
* kernel and IO page sizes.
|
||||
*/
|
||||
#define NVME_CTRL_PAGE_SHIFT 12
|
||||
#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT)
|
||||
|
||||
extern struct workqueue_struct *nvme_wq;
|
||||
extern struct workqueue_struct *nvme_reset_wq;
|
||||
extern struct workqueue_struct *nvme_delete_wq;
|
||||
@ -180,12 +188,32 @@ static inline u16 nvme_req_qid(struct request *req)
|
||||
*/
|
||||
#define NVME_QUIRK_DELAY_AMOUNT 2300
|
||||
|
||||
/*
|
||||
* enum nvme_ctrl_state: Controller state
|
||||
*
|
||||
* @NVME_CTRL_NEW: New controller just allocated, initial state
|
||||
* @NVME_CTRL_LIVE: Controller is connected and I/O capable
|
||||
* @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset)
|
||||
* @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the
|
||||
* transport
|
||||
* @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion)
|
||||
* @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not
|
||||
* disabled/failed immediately. This state comes
|
||||
* after all async event processing took place and
|
||||
* before ns removal and the controller deletion
|
||||
* progress
|
||||
* @NVME_CTRL_DEAD: Controller is non-present/unresponsive during
|
||||
* shutdown or removal. In this case we forcibly
|
||||
* kill all inflight I/O as they have no chance to
|
||||
* complete
|
||||
*/
|
||||
enum nvme_ctrl_state {
|
||||
NVME_CTRL_NEW,
|
||||
NVME_CTRL_LIVE,
|
||||
NVME_CTRL_RESETTING,
|
||||
NVME_CTRL_CONNECTING,
|
||||
NVME_CTRL_DELETING,
|
||||
NVME_CTRL_DELETING_NOIO,
|
||||
NVME_CTRL_DEAD,
|
||||
};
|
||||
|
||||
@ -198,6 +226,12 @@ struct nvme_fault_inject {
|
||||
#endif
|
||||
};
|
||||
|
||||
struct nvme_cel {
|
||||
struct list_head entry;
|
||||
struct nvme_effects_log log;
|
||||
u8 csi;
|
||||
};
|
||||
|
||||
struct nvme_ctrl {
|
||||
bool comp_seen;
|
||||
enum nvme_ctrl_state state;
|
||||
@ -235,10 +269,12 @@ struct nvme_ctrl {
|
||||
u32 queue_count;
|
||||
|
||||
u64 cap;
|
||||
u32 page_size;
|
||||
u32 max_hw_sectors;
|
||||
u32 max_segments;
|
||||
u32 max_integrity_segments;
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
u32 max_zone_append;
|
||||
#endif
|
||||
u16 crdt[3];
|
||||
u16 oncs;
|
||||
u16 oacs;
|
||||
@ -264,6 +300,7 @@ struct nvme_ctrl {
|
||||
unsigned long quirks;
|
||||
struct nvme_id_power_state psd[32];
|
||||
struct nvme_effects_log *effects;
|
||||
struct list_head cels;
|
||||
struct work_struct scan_work;
|
||||
struct work_struct async_event_work;
|
||||
struct delayed_work ka_work;
|
||||
@ -346,6 +383,7 @@ struct nvme_ns_ids {
|
||||
u8 eui64[8];
|
||||
u8 nguid[16];
|
||||
uuid_t uuid;
|
||||
u8 csi;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -365,6 +403,7 @@ struct nvme_ns_head {
|
||||
struct kref ref;
|
||||
bool shared;
|
||||
int instance;
|
||||
struct nvme_effects_log *effects;
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
struct gendisk *disk;
|
||||
struct bio_list requeue_list;
|
||||
@ -402,6 +441,9 @@ struct nvme_ns {
|
||||
u16 sgs;
|
||||
u32 sws;
|
||||
u8 pi_type;
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
u64 zsze;
|
||||
#endif
|
||||
unsigned long features;
|
||||
unsigned long flags;
|
||||
#define NVME_NS_REMOVING 0
|
||||
@ -567,8 +609,11 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
|
||||
int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
|
||||
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
|
||||
|
||||
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
|
||||
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
|
||||
void *log, size_t size, u64 offset);
|
||||
struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
|
||||
struct nvme_ns_head **head, int *srcu_idx);
|
||||
void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
|
||||
|
||||
extern const struct attribute_group *nvme_ns_id_attr_groups[];
|
||||
extern const struct block_device_operations nvme_ns_head_ops;
|
||||
@ -704,6 +749,36 @@ static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
|
||||
}
|
||||
#endif /* CONFIG_NVME_MULTIPATH */
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
|
||||
unsigned lbaf);
|
||||
|
||||
int nvme_report_zones(struct gendisk *disk, sector_t sector,
|
||||
unsigned int nr_zones, report_zones_cb cb, void *data);
|
||||
|
||||
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
|
||||
struct nvme_command *cmnd,
|
||||
enum nvme_zone_mgmt_action action);
|
||||
#else
|
||||
#define nvme_report_zones NULL
|
||||
|
||||
static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
|
||||
struct request *req, struct nvme_command *cmnd,
|
||||
enum nvme_zone_mgmt_action action)
|
||||
{
|
||||
return BLK_STS_NOTSUPP;
|
||||
}
|
||||
|
||||
static inline int nvme_update_zone_info(struct gendisk *disk,
|
||||
struct nvme_ns *ns,
|
||||
unsigned lbaf)
|
||||
{
|
||||
dev_warn(ns->ctrl->device,
|
||||
"Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
|
||||
return -EPROTONOSUPPORT;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NVM
|
||||
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
|
||||
void nvme_nvm_unregister(struct nvme_ns *ns);
|
||||
@ -735,4 +810,11 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl);
|
||||
static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
|
||||
#endif
|
||||
|
||||
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
u8 opcode);
|
||||
void nvme_execute_passthru_rq(struct request *rq);
|
||||
struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path);
|
||||
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
|
||||
void nvme_put_ns(struct nvme_ns *ns);
|
||||
|
||||
#endif /* _NVME_H */
|
||||
|
@ -4,6 +4,7 @@
|
||||
* Copyright (c) 2011-2014, Intel Corporation.
|
||||
*/
|
||||
|
||||
#include <linux/acpi.h>
|
||||
#include <linux/aer.h>
|
||||
#include <linux/async.h>
|
||||
#include <linux/blkdev.h>
|
||||
@ -61,10 +62,10 @@ MODULE_PARM_DESC(sgl_threshold,
|
||||
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
|
||||
static const struct kernel_param_ops io_queue_depth_ops = {
|
||||
.set = io_queue_depth_set,
|
||||
.get = param_get_int,
|
||||
.get = param_get_uint,
|
||||
};
|
||||
|
||||
static int io_queue_depth = 1024;
|
||||
static unsigned int io_queue_depth = 1024;
|
||||
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
|
||||
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
|
||||
|
||||
@ -94,6 +95,10 @@ static unsigned int poll_queues;
|
||||
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
|
||||
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
|
||||
|
||||
static bool noacpi;
|
||||
module_param(noacpi, bool, 0444);
|
||||
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
|
||||
|
||||
struct nvme_dev;
|
||||
struct nvme_queue;
|
||||
|
||||
@ -115,7 +120,7 @@ struct nvme_dev {
|
||||
unsigned max_qid;
|
||||
unsigned io_queues[HCTX_MAX_TYPES];
|
||||
unsigned int num_vecs;
|
||||
int q_depth;
|
||||
u16 q_depth;
|
||||
int io_sqes;
|
||||
u32 db_stride;
|
||||
void __iomem *bar;
|
||||
@ -151,13 +156,14 @@ struct nvme_dev {
|
||||
|
||||
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
|
||||
{
|
||||
int n = 0, ret;
|
||||
int ret;
|
||||
u16 n;
|
||||
|
||||
ret = kstrtoint(val, 10, &n);
|
||||
ret = kstrtou16(val, 10, &n);
|
||||
if (ret != 0 || n < 2)
|
||||
return -EINVAL;
|
||||
|
||||
return param_set_int(val, kp);
|
||||
return param_set_ushort(val, kp);
|
||||
}
|
||||
|
||||
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
|
||||
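The io_queue_depth hunk above narrows the parameter to a 16-bit value and validates it before anything is stored. As a hedged, self-contained sketch of the same kernel_param_ops pattern (module and parameter names are invented, not taken from this driver):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned short foo_queue_depth = 1024;

static int foo_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	u16 n;
	int ret;

	/* reject anything the hardware cannot use before storing it */
	ret = kstrtou16(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_ushort(val, kp);
}

static const struct kernel_param_ops foo_queue_depth_ops = {
	.set = foo_queue_depth_set,
	.get = param_get_ushort,
};

module_param_cb(foo_queue_depth, &foo_queue_depth_ops, &foo_queue_depth, 0644);
MODULE_PARM_DESC(foo_queue_depth, "queue depth, must be >= 2");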
@ -345,10 +351,10 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
|
||||
* as it only leads to a small amount of wasted memory for the lifetime of
|
||||
* the I/O.
|
||||
*/
|
||||
static int nvme_npages(unsigned size, struct nvme_dev *dev)
|
||||
static int nvme_pci_npages_prp(void)
|
||||
{
|
||||
unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
|
||||
dev->ctrl.page_size);
|
||||
unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
|
||||
NVME_CTRL_PAGE_SIZE);
|
||||
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
|
||||
}
|
||||
|
||||
@ -356,22 +362,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev)
|
||||
* Calculates the number of pages needed for the SGL segments. For example a 4k
|
||||
* page can accommodate 256 SGL descriptors.
|
||||
*/
|
||||
static int nvme_pci_npages_sgl(unsigned int num_seg)
|
||||
static int nvme_pci_npages_sgl(void)
|
||||
{
|
||||
return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
|
||||
return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
|
||||
PAGE_SIZE);
|
||||
}
|
||||
|
||||
static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
|
||||
unsigned int size, unsigned int nseg, bool use_sgl)
|
||||
static size_t nvme_pci_iod_alloc_size(void)
|
||||
{
|
||||
size_t alloc_size;
|
||||
size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
|
||||
|
||||
if (use_sgl)
|
||||
alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
|
||||
else
|
||||
alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
|
||||
|
||||
return alloc_size + sizeof(struct scatterlist) * nseg;
|
||||
return sizeof(__le64 *) * npages +
|
||||
sizeof(struct scatterlist) * NVME_MAX_SEGS;
|
||||
}
|
||||
|
||||
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
|
||||
@ -500,9 +502,6 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
|
||||
int nseg = blk_rq_nr_phys_segments(req);
|
||||
unsigned int avg_seg_size;
|
||||
|
||||
if (nseg == 0)
|
||||
return false;
|
||||
|
||||
avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
|
||||
|
||||
if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
|
||||
@ -517,7 +516,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
|
||||
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
|
||||
{
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
|
||||
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
|
||||
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
|
||||
int i;
|
||||
|
||||
@ -584,34 +583,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
|
||||
struct scatterlist *sg = iod->sg;
|
||||
int dma_len = sg_dma_len(sg);
|
||||
u64 dma_addr = sg_dma_address(sg);
|
||||
u32 page_size = dev->ctrl.page_size;
|
||||
int offset = dma_addr & (page_size - 1);
|
||||
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
|
||||
__le64 *prp_list;
|
||||
void **list = nvme_pci_iod_list(req);
|
||||
dma_addr_t prp_dma;
|
||||
int nprps, i;
|
||||
|
||||
length -= (page_size - offset);
|
||||
length -= (NVME_CTRL_PAGE_SIZE - offset);
|
||||
if (length <= 0) {
|
||||
iod->first_dma = 0;
|
||||
goto done;
|
||||
}
|
||||
|
||||
dma_len -= (page_size - offset);
|
||||
dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
|
||||
if (dma_len) {
|
||||
dma_addr += (page_size - offset);
|
||||
dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
|
||||
} else {
|
||||
sg = sg_next(sg);
|
||||
dma_addr = sg_dma_address(sg);
|
||||
dma_len = sg_dma_len(sg);
|
||||
}
|
||||
|
||||
if (length <= page_size) {
|
||||
if (length <= NVME_CTRL_PAGE_SIZE) {
|
||||
iod->first_dma = dma_addr;
|
||||
goto done;
|
||||
}
|
||||
|
||||
nprps = DIV_ROUND_UP(length, page_size);
|
||||
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
|
||||
if (nprps <= (256 / 8)) {
|
||||
pool = dev->prp_small_pool;
|
||||
iod->npages = 0;
|
||||
@ -630,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
|
||||
iod->first_dma = prp_dma;
|
||||
i = 0;
|
||||
for (;;) {
|
||||
if (i == page_size >> 3) {
|
||||
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
|
||||
__le64 *old_prp_list = prp_list;
|
||||
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
|
||||
if (!prp_list)
|
||||
@ -641,9 +639,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
|
||||
i = 1;
|
||||
}
|
||||
prp_list[i++] = cpu_to_le64(dma_addr);
|
||||
dma_len -= page_size;
|
||||
dma_addr += page_size;
|
||||
length -= page_size;
|
||||
dma_len -= NVME_CTRL_PAGE_SIZE;
|
||||
dma_addr += NVME_CTRL_PAGE_SIZE;
|
||||
length -= NVME_CTRL_PAGE_SIZE;
|
||||
if (length <= 0)
|
||||
break;
|
||||
if (dma_len > 0)
|
||||
@ -753,8 +751,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
|
||||
struct bio_vec *bv)
|
||||
{
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
|
||||
unsigned int first_prp_len = dev->ctrl.page_size - offset;
|
||||
unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
|
||||
unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
|
||||
|
||||
iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
|
||||
if (dma_mapping_error(dev->dev, iod->first_dma))
|
||||
@ -764,7 +762,7 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
|
||||
cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
|
||||
if (bv->bv_len > first_prp_len)
|
||||
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
|
||||
return 0;
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
|
||||
@ -782,7 +780,7 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
|
||||
cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
|
||||
cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
|
||||
cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
|
||||
return 0;
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
|
||||
@ -796,7 +794,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
|
||||
struct bio_vec bv = req_bvec(req);
|
||||
|
||||
if (!is_pci_p2pdma_page(bv.bv_page)) {
|
||||
if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
|
||||
if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
|
||||
return nvme_setup_prp_simple(dev, req,
|
||||
&cmnd->rw, &bv);
|
||||
|
||||
@ -846,7 +844,7 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
|
||||
if (dma_mapping_error(dev->dev, iod->meta_dma))
|
||||
return BLK_STS_IOERR;
|
||||
cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
|
||||
return 0;
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1019,6 +1017,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
|
||||
static irqreturn_t nvme_irq_check(int irq, void *data)
|
||||
{
|
||||
struct nvme_queue *nvmeq = data;
|
||||
|
||||
if (nvme_cqe_pending(nvmeq))
|
||||
return IRQ_WAKE_THREAD;
|
||||
return IRQ_NONE;
|
||||
@ -1154,7 +1153,6 @@ static void abort_endio(struct request *req, blk_status_t error)
|
||||
|
||||
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
|
||||
{
|
||||
|
||||
/* If true, indicates loss of adapter communication, possibly by a
|
||||
* NVMe Subsystem reset.
|
||||
*/
|
||||
@ -1261,9 +1259,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
|
||||
}
|
||||
|
||||
/*
|
||||
* Shutdown the controller immediately and schedule a reset if the
|
||||
* command was already aborted once before and still hasn't been
|
||||
* returned to the driver, or if this is the admin queue.
|
||||
* Shutdown the controller immediately and schedule a reset if the
|
||||
* command was already aborted once before and still hasn't been
|
||||
* returned to the driver, or if this is the admin queue.
|
||||
*/
|
||||
if (!nvmeq->qid || iod->aborted) {
|
||||
dev_warn(dev->ctrl.device,
|
||||
@ -1398,11 +1396,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
|
||||
{
|
||||
int q_depth = dev->q_depth;
|
||||
unsigned q_size_aligned = roundup(q_depth * entry_size,
|
||||
dev->ctrl.page_size);
|
||||
NVME_CTRL_PAGE_SIZE);
|
||||
|
||||
if (q_size_aligned * nr_io_queues > dev->cmb_size) {
|
||||
u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
|
||||
mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
|
||||
|
||||
mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
|
||||
q_depth = div_u64(mem_per_q, entry_size);
|
||||
|
||||
/*
|
||||
@ -1817,6 +1816,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
|
||||
|
||||
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
|
||||
{
|
||||
u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
|
||||
u64 dma_addr = dev->host_mem_descs_dma;
|
||||
struct nvme_command c;
|
||||
int ret;
|
||||
@ -1825,8 +1825,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
|
||||
c.features.opcode = nvme_admin_set_features;
|
||||
c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
|
||||
c.features.dword11 = cpu_to_le32(bits);
|
||||
c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
|
||||
ilog2(dev->ctrl.page_size));
|
||||
c.features.dword12 = cpu_to_le32(host_mem_size);
|
||||
c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
|
||||
c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
|
||||
c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
|
||||
@ -1846,7 +1845,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
|
||||
|
||||
for (i = 0; i < dev->nr_host_mem_descs; i++) {
|
||||
struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
|
||||
size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
|
||||
size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
|
||||
|
||||
dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
|
||||
le64_to_cpu(desc->addr),
|
||||
@ -1898,7 +1897,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
|
||||
break;
|
||||
|
||||
descs[i].addr = cpu_to_le64(dma_addr);
|
||||
descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
|
||||
descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
|
||||
i++;
|
||||
}
|
||||
|
||||
@ -1914,7 +1913,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
|
||||
|
||||
out_free_bufs:
|
||||
while (--i >= 0) {
|
||||
size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
|
||||
size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
|
||||
|
||||
dma_free_attrs(dev->dev, size, bufs[i],
|
||||
le64_to_cpu(descs[i].addr),
|
||||
@ -1932,12 +1931,12 @@ out:
|
||||
|
||||
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
|
||||
{
|
||||
u32 chunk_size;
|
||||
u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
|
||||
u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
|
||||
u64 chunk_size;
|
||||
|
||||
/* start big and work our way down */
|
||||
for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
|
||||
chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
|
||||
chunk_size /= 2) {
|
||||
for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
|
||||
if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
|
||||
if (!min || dev->host_mem_size >= min)
|
||||
return 0;
|
||||
@ -2003,7 +2002,7 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
|
||||
unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
|
||||
|
||||
/*
|
||||
* If there is no interupt available for queues, ensure that
|
||||
* If there is no interrupt available for queues, ensure that
|
||||
* the default queue is set to 1. The affinity set size is
|
||||
* also set to one, but the irq core ignores it for this case.
|
||||
*
|
||||
@ -2261,8 +2260,8 @@ static void nvme_dev_add(struct nvme_dev *dev)
|
||||
dev->tagset.nr_maps++;
|
||||
dev->tagset.timeout = NVME_IO_TIMEOUT;
|
||||
dev->tagset.numa_node = dev->ctrl.numa_node;
|
||||
dev->tagset.queue_depth =
|
||||
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
|
||||
dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
|
||||
BLK_MQ_MAX_DEPTH) - 1;
|
||||
dev->tagset.cmd_size = sizeof(struct nvme_iod);
|
||||
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
|
||||
dev->tagset.driver_data = dev;
|
||||
@ -2321,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
|
||||
|
||||
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
|
||||
|
||||
dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
|
||||
dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1,
|
||||
io_queue_depth);
|
||||
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
|
||||
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
|
||||
@ -2760,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ACPI
|
||||
static bool nvme_acpi_storage_d3(struct pci_dev *dev)
|
||||
{
|
||||
struct acpi_device *adev;
|
||||
struct pci_dev *root;
|
||||
acpi_handle handle;
|
||||
acpi_status status;
|
||||
u8 val;
|
||||
|
||||
/*
|
||||
* Look for _DSD property specifying that the storage device on the port
|
||||
* must use D3 to support deep platform power savings during
|
||||
* suspend-to-idle.
|
||||
*/
|
||||
root = pcie_find_root_port(dev);
|
||||
if (!root)
|
||||
return false;
|
||||
|
||||
adev = ACPI_COMPANION(&root->dev);
|
||||
if (!adev)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The property is defined in the PXSX device for South complex ports
|
||||
* and in the PEGP device for North complex ports.
|
||||
*/
|
||||
status = acpi_get_handle(adev->handle, "PXSX", &handle);
|
||||
if (ACPI_FAILURE(status)) {
|
||||
status = acpi_get_handle(adev->handle, "PEGP", &handle);
|
||||
if (ACPI_FAILURE(status))
|
||||
return false;
|
||||
}
|
||||
|
||||
if (acpi_bus_get_device(handle, &adev))
|
||||
return false;
|
||||
|
||||
if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
|
||||
&val))
|
||||
return false;
|
||||
return val == 1;
|
||||
}
|
||||
#else
|
||||
static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif /* CONFIG_ACPI */
|
||||
|
||||
static void nvme_async_probe(void *data, async_cookie_t cookie)
|
||||
{
|
||||
struct nvme_dev *dev = data;
|
||||
@ -2809,12 +2856,21 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||
|
||||
quirks |= check_vendor_combination_bug(pdev);
|
||||
|
||||
if (!noacpi && nvme_acpi_storage_d3(pdev)) {
|
||||
/*
|
||||
* Some systems use a bios work around to ask for D3 on
|
||||
* platforms that support kernel managed suspend.
|
||||
*/
|
||||
dev_info(&pdev->dev,
|
||||
"platform quirk: setting simple suspend\n");
|
||||
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
|
||||
}
|
||||
|
||||
/*
|
||||
* Double check that our mempool alloc size will cover the biggest
|
||||
* command we support.
|
||||
*/
|
||||
alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
|
||||
NVME_MAX_SEGS, true);
|
||||
alloc_size = nvme_pci_iod_alloc_size();
|
||||
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
|
||||
|
||||
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
|
||||
@ -2876,6 +2932,7 @@ static void nvme_reset_done(struct pci_dev *pdev)
|
||||
static void nvme_shutdown(struct pci_dev *pdev)
|
||||
{
|
||||
struct nvme_dev *dev = pci_get_drvdata(pdev);
|
||||
|
||||
nvme_disable_prepare_reset(dev, true);
|
||||
}
|
||||
|
||||
@ -3006,6 +3063,7 @@ unfreeze:
|
||||
static int nvme_simple_suspend(struct device *dev)
|
||||
{
|
||||
struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
|
||||
|
||||
return nvme_disable_prepare_reset(ndev, true);
|
||||
}
|
||||
|
||||
@ -3079,16 +3137,16 @@ static const struct pci_error_handlers nvme_err_handler = {
|
||||
};
|
||||
|
||||
static const struct pci_device_id nvme_id_table[] = {
|
||||
{ PCI_VDEVICE(INTEL, 0x0953),
|
||||
{ PCI_VDEVICE(INTEL, 0x0953), /* Intel 750/P3500/P3600/P3700 */
|
||||
.driver_data = NVME_QUIRK_STRIPE_SIZE |
|
||||
NVME_QUIRK_DEALLOCATE_ZEROES, },
|
||||
{ PCI_VDEVICE(INTEL, 0x0a53),
|
||||
{ PCI_VDEVICE(INTEL, 0x0a53), /* Intel P3520 */
|
||||
.driver_data = NVME_QUIRK_STRIPE_SIZE |
|
||||
NVME_QUIRK_DEALLOCATE_ZEROES, },
|
||||
{ PCI_VDEVICE(INTEL, 0x0a54),
|
||||
{ PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */
|
||||
.driver_data = NVME_QUIRK_STRIPE_SIZE |
|
||||
NVME_QUIRK_DEALLOCATE_ZEROES, },
|
||||
{ PCI_VDEVICE(INTEL, 0x0a55),
|
||||
{ PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */
|
||||
.driver_data = NVME_QUIRK_STRIPE_SIZE |
|
||||
NVME_QUIRK_DEALLOCATE_ZEROES, },
|
||||
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
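The entries above now carry a comment naming the product each quirk targets. Purely as a hypothetical illustration of the table's shape (the 0x1234 device ID and the chosen quirk are invented, not from this patch), a new entry would look like:

	{ PCI_VDEVICE(INTEL, 0x1234),	/* hypothetical example device */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },

PCI_VDEVICE() fills in the vendor/device pair and wildcards the subsystem IDs, so only the quirk flags in .driver_data are device specific.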
|
||||
|
@ -96,6 +96,7 @@ struct nvme_rdma_queue {
|
||||
int cm_error;
|
||||
struct completion cm_done;
|
||||
bool pi_support;
|
||||
int cq_size;
|
||||
};
|
||||
|
||||
struct nvme_rdma_ctrl {
|
||||
@ -275,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
|
||||
init_attr.recv_cq = queue->ib_cq;
|
||||
if (queue->pi_support)
|
||||
init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
|
||||
init_attr.qp_context = queue;
|
||||
|
||||
ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
|
||||
|
||||
@ -409,6 +411,14 @@ out_err:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
|
||||
{
|
||||
if (nvme_rdma_poll_queue(queue))
|
||||
ib_free_cq(queue->ib_cq);
|
||||
else
|
||||
ib_cq_pool_put(queue->ib_cq, queue->cq_size);
|
||||
}
|
||||
|
||||
static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
|
||||
{
|
||||
struct nvme_rdma_device *dev;
|
||||
@ -430,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
|
||||
* the destruction of the QP shouldn't use rdma_cm API.
|
||||
*/
|
||||
ib_destroy_qp(queue->qp);
|
||||
ib_free_cq(queue->ib_cq);
|
||||
nvme_rdma_free_cq(queue);
|
||||
|
||||
nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
|
||||
sizeof(struct nvme_completion), DMA_FROM_DEVICE);
|
||||
@ -450,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
|
||||
return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
|
||||
}
|
||||
|
||||
static int nvme_rdma_create_cq(struct ib_device *ibdev,
|
||||
struct nvme_rdma_queue *queue)
|
||||
{
|
||||
int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
|
||||
enum ib_poll_context poll_ctx;
|
||||
|
||||
/*
|
||||
* Spread I/O queues completion vectors according their queue index.
|
||||
* Admin queues can always go on completion vector 0.
|
||||
*/
|
||||
comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
|
||||
|
||||
/* Polling queues need direct cq polling context */
|
||||
if (nvme_rdma_poll_queue(queue)) {
|
||||
poll_ctx = IB_POLL_DIRECT;
|
||||
queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
|
||||
comp_vector, poll_ctx);
|
||||
} else {
|
||||
poll_ctx = IB_POLL_SOFTIRQ;
|
||||
queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
|
||||
comp_vector, poll_ctx);
|
||||
}
|
||||
|
||||
if (IS_ERR(queue->ib_cq)) {
|
||||
ret = PTR_ERR(queue->ib_cq);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
|
||||
{
|
||||
struct ib_device *ibdev;
|
||||
const int send_wr_factor = 3; /* MR, SEND, INV */
|
||||
const int cq_factor = send_wr_factor + 1; /* + RECV */
|
||||
int comp_vector, idx = nvme_rdma_queue_idx(queue);
|
||||
enum ib_poll_context poll_ctx;
|
||||
int ret, pages_per_mr;
|
||||
|
||||
queue->device = nvme_rdma_find_get_device(queue->cm_id);
|
||||
@ -467,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
|
||||
}
|
||||
ibdev = queue->device->dev;
|
||||
|
||||
/*
|
||||
* Spread I/O queues completion vectors according their queue index.
|
||||
* Admin queues can always go on completion vector 0.
|
||||
*/
|
||||
comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
|
||||
|
||||
/* Polling queues need direct cq polling context */
|
||||
if (nvme_rdma_poll_queue(queue))
|
||||
poll_ctx = IB_POLL_DIRECT;
|
||||
else
|
||||
poll_ctx = IB_POLL_SOFTIRQ;
|
||||
|
||||
/* +1 for ib_stop_cq */
|
||||
queue->ib_cq = ib_alloc_cq(ibdev, queue,
|
||||
cq_factor * queue->queue_size + 1,
|
||||
comp_vector, poll_ctx);
|
||||
if (IS_ERR(queue->ib_cq)) {
|
||||
ret = PTR_ERR(queue->ib_cq);
|
||||
queue->cq_size = cq_factor * queue->queue_size + 1;
|
||||
|
||||
ret = nvme_rdma_create_cq(ibdev, queue);
|
||||
if (ret)
|
||||
goto out_put_dev;
|
||||
}
|
||||
|
||||
ret = nvme_rdma_create_qp(queue, send_wr_factor);
|
||||
if (ret)
|
||||
@ -512,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
|
||||
if (ret) {
|
||||
dev_err(queue->ctrl->ctrl.device,
|
||||
"failed to initialize MR pool sized %d for QID %d\n",
|
||||
queue->queue_size, idx);
|
||||
queue->queue_size, nvme_rdma_queue_idx(queue));
|
||||
goto out_destroy_ring;
|
||||
}
|
||||
|
||||
@ -523,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
|
||||
if (ret) {
|
||||
dev_err(queue->ctrl->ctrl.device,
|
||||
"failed to initialize PI MR pool sized %d for QID %d\n",
|
||||
queue->queue_size, idx);
|
||||
queue->queue_size, nvme_rdma_queue_idx(queue));
|
||||
goto out_destroy_mr_pool;
|
||||
}
|
||||
}
|
||||
@ -540,7 +565,7 @@ out_destroy_ring:
|
||||
out_destroy_qp:
|
||||
rdma_destroy_qp(queue->cm_id);
|
||||
out_destroy_ib_cq:
|
||||
ib_free_cq(queue->ib_cq);
|
||||
nvme_rdma_free_cq(queue);
|
||||
out_put_dev:
|
||||
nvme_rdma_dev_put(queue->device);
|
||||
return ret;
|
||||
@ -942,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
|
||||
ret = PTR_ERR(ctrl->ctrl.connect_q);
|
||||
goto out_free_tag_set;
|
||||
}
|
||||
} else {
|
||||
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
|
||||
ctrl->ctrl.queue_count - 1);
|
||||
}
|
||||
|
||||
ret = nvme_rdma_start_io_queues(ctrl);
|
||||
if (ret)
|
||||
goto out_cleanup_connect_q;
|
||||
|
||||
if (!new) {
|
||||
nvme_start_queues(&ctrl->ctrl);
|
||||
nvme_wait_freeze(&ctrl->ctrl);
|
||||
blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
|
||||
ctrl->ctrl.queue_count - 1);
|
||||
nvme_unfreeze(&ctrl->ctrl);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
out_cleanup_connect_q:
|
||||
@ -983,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
|
||||
bool remove)
|
||||
{
|
||||
if (ctrl->ctrl.queue_count > 1) {
|
||||
nvme_start_freeze(&ctrl->ctrl);
|
||||
nvme_stop_queues(&ctrl->ctrl);
|
||||
nvme_rdma_stop_io_queues(ctrl);
|
||||
if (ctrl->ctrl.tagset) {
|
||||
@ -1077,11 +1108,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
|
||||
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
|
||||
if (!changed) {
|
||||
/*
|
||||
* state change failure is ok if we're in DELETING state,
|
||||
* state change failure is ok if we started ctrl delete,
|
||||
* unless we're during creation of a new controller to
|
||||
* avoid races with teardown flow.
|
||||
*/
|
||||
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
|
||||
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
|
||||
ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
|
||||
WARN_ON_ONCE(new);
|
||||
ret = -EINVAL;
|
||||
goto destroy_io;
|
||||
@ -1134,8 +1166,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
|
||||
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
|
||||
|
||||
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
|
||||
/* state change failure is ok if we're in DELETING state */
|
||||
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
|
||||
/* state change failure is ok if we started ctrl delete */
|
||||
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
|
||||
ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1163,7 +1196,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req)
|
||||
static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
|
||||
const char *op)
|
||||
{
|
||||
struct nvme_rdma_queue *queue = cq->cq_context;
|
||||
struct nvme_rdma_queue *queue = wc->qp->qp_context;
|
||||
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
|
||||
|
||||
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
|
||||
@ -1706,7 +1739,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
|
||||
{
|
||||
struct nvme_rdma_qe *qe =
|
||||
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
|
||||
struct nvme_rdma_queue *queue = cq->cq_context;
|
||||
struct nvme_rdma_queue *queue = wc->qp->qp_context;
|
||||
struct ib_device *ibdev = queue->device->dev;
|
||||
struct nvme_completion *cqe = qe->data;
|
||||
const size_t len = sizeof(struct nvme_completion);
|
||||
|
@ -46,6 +46,7 @@ struct nvme_tcp_request {
|
||||
u32 pdu_sent;
|
||||
u16 ttag;
|
||||
struct list_head entry;
|
||||
struct llist_node lentry;
|
||||
__le32 ddgst;
|
||||
|
||||
struct bio *curr_bio;
|
||||
@ -75,9 +76,10 @@ struct nvme_tcp_queue {
|
||||
struct work_struct io_work;
|
||||
int io_cpu;
|
||||
|
||||
spinlock_t lock;
|
||||
struct mutex send_mutex;
|
||||
struct llist_head req_list;
|
||||
struct list_head send_list;
|
||||
bool more_requests;
|
||||
|
||||
/* recv state */
|
||||
void *pdu;
|
||||
@ -261,15 +263,13 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
|
||||
}
|
||||
|
||||
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
|
||||
bool sync)
|
||||
bool sync, bool last)
|
||||
{
|
||||
struct nvme_tcp_queue *queue = req->queue;
|
||||
bool empty;
|
||||
|
||||
spin_lock(&queue->lock);
|
||||
empty = list_empty(&queue->send_list) && !queue->request;
|
||||
list_add_tail(&req->entry, &queue->send_list);
|
||||
spin_unlock(&queue->lock);
|
||||
empty = llist_add(&req->lentry, &queue->req_list) &&
|
||||
list_empty(&queue->send_list) && !queue->request;
|
||||
|
||||
/*
|
||||
* if we're the first on the send_list and we can try to send
|
||||
@ -278,25 +278,42 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
|
||||
*/
|
||||
if (queue->io_cpu == smp_processor_id() &&
|
||||
sync && empty && mutex_trylock(&queue->send_mutex)) {
|
||||
queue->more_requests = !last;
|
||||
nvme_tcp_try_send(queue);
|
||||
queue->more_requests = false;
|
||||
mutex_unlock(&queue->send_mutex);
|
||||
} else {
|
||||
} else if (last) {
|
||||
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
|
||||
}
|
||||
}
|
||||
|
||||
static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
|
||||
{
|
||||
struct nvme_tcp_request *req;
|
||||
struct llist_node *node;
|
||||
|
||||
for (node = llist_del_all(&queue->req_list); node; node = node->next) {
|
||||
req = llist_entry(node, struct nvme_tcp_request, lentry);
|
||||
list_add(&req->entry, &queue->send_list);
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct nvme_tcp_request *
|
||||
nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
|
||||
{
|
||||
struct nvme_tcp_request *req;
|
||||
|
||||
spin_lock(&queue->lock);
|
||||
req = list_first_entry_or_null(&queue->send_list,
|
||||
struct nvme_tcp_request, entry);
|
||||
if (req)
|
||||
list_del(&req->entry);
|
||||
spin_unlock(&queue->lock);
|
||||
if (!req) {
|
||||
nvme_tcp_process_req_list(queue);
|
||||
req = list_first_entry_or_null(&queue->send_list,
|
||||
struct nvme_tcp_request, entry);
|
||||
if (unlikely(!req))
|
||||
return NULL;
|
||||
}
|
||||
|
||||
list_del(&req->entry);
|
||||
return req;
|
||||
}
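The two hunks above replace the spinlock-protected producer path with a lock-less llist: nvme_tcp_queue_request() publishes with llist_add(), and the io context later splices everything with llist_del_all(). Because llist_del_all() hands back the chain newest-first, re-adding each node at the head of the regular list restores first-in, first-out order. A hedged, generic sketch of that pattern (item and field names are illustrative, not from this driver):

#include <linux/list.h>
#include <linux/llist.h>

struct work_item {
	struct llist_node lentry;	/* lock-less producer side */
	struct list_head entry;		/* consumer-private FIFO */
};

/* producer, safe from any context; returns true if the inbox was empty */
static bool submit_item(struct llist_head *inbox, struct work_item *item)
{
	return llist_add(&item->lentry, inbox);
}

/* single consumer: splice all pending items into a private FIFO list */
static void drain_inbox(struct llist_head *inbox, struct list_head *fifo)
{
	struct llist_node *node = llist_del_all(inbox);
	struct work_item *item;

	/* newest-first chain + list_add() at the head == oldest-first FIFO */
	for (; node; node = node->next) {
		item = llist_entry(node, struct work_item, lentry);
		list_add(&item->entry, fifo);
	}
}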
|
||||
|
||||
@ -596,7 +613,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
|
||||
req->state = NVME_TCP_SEND_H2C_PDU;
|
||||
req->offset = 0;
|
||||
|
||||
nvme_tcp_queue_request(req, false);
|
||||
nvme_tcp_queue_request(req, false, true);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -863,6 +880,12 @@ done:
|
||||
read_unlock(&sk->sk_callback_lock);
|
||||
}
|
||||
|
||||
static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
|
||||
{
|
||||
return !list_empty(&queue->send_list) ||
|
||||
!llist_empty(&queue->req_list) || queue->more_requests;
|
||||
}
|
||||
|
||||
static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
|
||||
{
|
||||
queue->request = NULL;
|
||||
@ -884,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
|
||||
bool last = nvme_tcp_pdu_last_send(req, len);
|
||||
int ret, flags = MSG_DONTWAIT;
|
||||
|
||||
if (last && !queue->data_digest)
|
||||
if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
|
||||
flags |= MSG_EOR;
|
||||
else
|
||||
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
|
||||
@ -931,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
|
||||
int flags = MSG_DONTWAIT;
|
||||
int ret;
|
||||
|
||||
if (inline_data)
|
||||
if (inline_data || nvme_tcp_queue_more(queue))
|
||||
flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
|
||||
else
|
||||
flags |= MSG_EOR;
|
||||
@ -996,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
|
||||
{
|
||||
struct nvme_tcp_queue *queue = req->queue;
|
||||
int ret;
|
||||
struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
|
||||
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
|
||||
struct kvec iov = {
|
||||
.iov_base = &req->ddgst + req->offset,
|
||||
.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
|
||||
};
|
||||
|
||||
if (nvme_tcp_queue_more(queue))
|
||||
msg.msg_flags |= MSG_MORE;
|
||||
else
|
||||
msg.msg_flags |= MSG_EOR;
|
||||
|
||||
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
|
||||
if (unlikely(ret <= 0))
|
||||
return ret;
|
||||
@ -1344,8 +1372,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
|
||||
int ret, rcv_pdu_size;
|
||||
|
||||
queue->ctrl = ctrl;
|
||||
init_llist_head(&queue->req_list);
|
||||
INIT_LIST_HEAD(&queue->send_list);
|
||||
spin_lock_init(&queue->lock);
|
||||
mutex_init(&queue->send_mutex);
|
||||
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
|
||||
queue->queue_size = queue_size;
|
||||
@ -1746,15 +1774,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
|
||||
ret = PTR_ERR(ctrl->connect_q);
|
||||
goto out_free_tag_set;
|
||||
}
|
||||
} else {
|
||||
blk_mq_update_nr_hw_queues(ctrl->tagset,
|
||||
ctrl->queue_count - 1);
|
||||
}
|
||||
|
||||
ret = nvme_tcp_start_io_queues(ctrl);
|
||||
if (ret)
|
||||
goto out_cleanup_connect_q;
|
||||
|
||||
if (!new) {
|
||||
nvme_start_queues(ctrl);
|
||||
nvme_wait_freeze(ctrl);
|
||||
blk_mq_update_nr_hw_queues(ctrl->tagset,
|
||||
ctrl->queue_count - 1);
|
||||
nvme_unfreeze(ctrl);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
out_cleanup_connect_q:
|
||||
@ -1859,6 +1892,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
|
||||
{
|
||||
if (ctrl->queue_count <= 1)
|
||||
return;
|
||||
nvme_start_freeze(ctrl);
|
||||
nvme_stop_queues(ctrl);
|
||||
nvme_tcp_stop_io_queues(ctrl);
|
||||
if (ctrl->tagset) {
|
||||
@ -1925,11 +1959,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
|
||||
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
|
||||
/*
|
||||
* state change failure is ok if we're in DELETING state,
|
||||
* state change failure is ok if we started ctrl delete,
|
||||
* unless we're during creation of a new controller to
|
||||
* avoid races with teardown flow.
|
||||
*/
|
||||
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
|
||||
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
|
||||
ctrl->state != NVME_CTRL_DELETING_NOIO);
|
||||
WARN_ON_ONCE(new);
|
||||
ret = -EINVAL;
|
||||
goto destroy_io;
|
||||
@ -1985,8 +2020,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
|
||||
blk_mq_unquiesce_queue(ctrl->admin_q);
|
||||
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
|
||||
/* state change failure is ok if we're in DELETING state */
|
||||
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
|
||||
/* state change failure is ok if we started ctrl delete */
|
||||
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
|
||||
ctrl->state != NVME_CTRL_DELETING_NOIO);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -2021,8 +2057,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
|
||||
nvme_tcp_teardown_ctrl(ctrl, false);
|
||||
|
||||
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
|
||||
/* state change failure is ok if we're in DELETING state */
|
||||
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
|
||||
/* state change failure is ok if we started ctrl delete */
|
||||
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
|
||||
ctrl->state != NVME_CTRL_DELETING_NOIO);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -2109,7 +2146,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
|
||||
ctrl->async_req.curr_bio = NULL;
|
||||
ctrl->async_req.data_len = 0;
|
||||
|
||||
nvme_tcp_queue_request(&ctrl->async_req, true);
|
||||
nvme_tcp_queue_request(&ctrl->async_req, true, true);
|
||||
}
|
||||
|
||||
static enum blk_eh_timer_return
|
||||
@ -2221,6 +2258,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct nvme_tcp_queue *queue = hctx->driver_data;
|
||||
|
||||
if (!llist_empty(&queue->req_list))
|
||||
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
|
||||
}
|
||||
|
||||
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
|
||||
const struct blk_mq_queue_data *bd)
|
||||
{
|
||||
@ -2240,7 +2285,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
|
||||
|
||||
blk_mq_start_request(rq);
|
||||
|
||||
nvme_tcp_queue_request(req, true);
|
||||
nvme_tcp_queue_request(req, true, bd->last);
|
||||
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
@ -2308,6 +2353,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
|
||||
|
||||
static const struct blk_mq_ops nvme_tcp_mq_ops = {
|
||||
.queue_rq = nvme_tcp_queue_rq,
|
||||
.commit_rqs = nvme_tcp_commit_rqs,
|
||||
.complete = nvme_complete_rq,
|
||||
.init_request = nvme_tcp_init_request,
|
||||
.exit_request = nvme_tcp_exit_request,
|
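nvme-tcp now also implements ->commit_rqs() and honours bd->last, so expensive per-submission work can be batched across a plugged run of requests. The contract, shown here as a hedged sketch with an invented driver (the foo_* names and helpers are hypothetical, not the code above): ->queue_rq() may defer the "kick" while bd->last is false, and ->commit_rqs() must perform that kick when blk-mq ends a batch early.

#include <linux/blk-mq.h>

struct foo_queue { /* driver-private per-hw-queue state */ };

static void foo_enqueue(struct foo_queue *q, struct request *rq) { /* ... */ }
static void foo_kick(struct foo_queue *q) { /* ... */ }

static blk_status_t foo_queue_rq(struct blk_mq_hw_ctx *hctx,
				 const struct blk_mq_queue_data *bd)
{
	struct foo_queue *q = hctx->driver_data;

	blk_mq_start_request(bd->rq);
	foo_enqueue(q, bd->rq);
	if (bd->last)
		foo_kick(q);		/* end of the batch: submit now */
	return BLK_STS_OK;
}

static void foo_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *q = hctx->driver_data;

	foo_kick(q);			/* flush whatever was deferred */
}

static const struct blk_mq_ops foo_mq_ops = {
	.queue_rq	= foo_queue_rq,
	.commit_rqs	= foo_commit_rqs,
};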
drivers/nvme/host/zns.c (new file, 256 lines)
@ -0,0 +1,256 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2020 Western Digital Corporation or its affiliates.
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include "nvme.h"
|
||||
|
||||
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
struct nvme_command c = { };
|
||||
struct nvme_id_ctrl_zns *id;
|
||||
int status;
|
||||
|
||||
id = kzalloc(sizeof(*id), GFP_KERNEL);
|
||||
if (!id)
|
||||
return -ENOMEM;
|
||||
|
||||
c.identify.opcode = nvme_admin_identify;
|
||||
c.identify.cns = NVME_ID_CNS_CS_CTRL;
|
||||
c.identify.csi = NVME_CSI_ZNS;
|
||||
|
||||
status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
|
||||
if (status) {
|
||||
kfree(id);
|
||||
return status;
|
||||
}
|
||||
|
||||
if (id->zasl)
|
||||
ctrl->max_zone_append = 1 << (id->zasl + 3);
|
||||
else
|
||||
ctrl->max_zone_append = ctrl->max_hw_sectors;
|
||||
kfree(id);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns,
|
||||
unsigned lbaf)
|
||||
{
|
||||
struct nvme_effects_log *log = ns->head->effects;
|
||||
struct request_queue *q = disk->queue;
|
||||
struct nvme_command c = { };
|
||||
struct nvme_id_ns_zns *id;
|
||||
int status;
|
||||
|
||||
/* Driver requires zone append support */
|
||||
if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) &
|
||||
NVME_CMD_EFFECTS_CSUPP)) {
|
||||
dev_warn(ns->ctrl->device,
|
||||
"append not supported for zoned namespace:%d\n",
|
||||
ns->head->ns_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Lazily query controller append limit for the first zoned namespace */
|
||||
if (!ns->ctrl->max_zone_append) {
|
||||
status = nvme_set_max_append(ns->ctrl);
|
||||
if (status)
|
||||
return status;
|
||||
}
|
||||
|
||||
id = kzalloc(sizeof(*id), GFP_KERNEL);
|
||||
if (!id)
|
||||
return -ENOMEM;
|
||||
|
||||
c.identify.opcode = nvme_admin_identify;
|
||||
c.identify.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
c.identify.cns = NVME_ID_CNS_CS_NS;
|
||||
c.identify.csi = NVME_CSI_ZNS;
|
||||
|
||||
status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id));
|
||||
if (status)
|
||||
goto free_data;
|
||||
|
||||
/*
|
||||
* We currently do not handle devices requiring any of the zoned
|
||||
* operation characteristics.
|
||||
*/
|
||||
if (id->zoc) {
|
||||
dev_warn(ns->ctrl->device,
|
||||
"zone operations:%x not supported for namespace:%u\n",
|
||||
le16_to_cpu(id->zoc), ns->head->ns_id);
|
||||
status = -EINVAL;
|
||||
goto free_data;
|
||||
}
|
||||
|
||||
ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
|
||||
if (!is_power_of_2(ns->zsze)) {
|
||||
dev_warn(ns->ctrl->device,
|
||||
"invalid zone size:%llu for namespace:%u\n",
|
||||
ns->zsze, ns->head->ns_id);
|
||||
status = -EINVAL;
|
||||
goto free_data;
|
||||
}
|
||||
|
||||
q->limits.zoned = BLK_ZONED_HM;
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
|
||||
blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
|
||||
blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
|
||||
free_data:
|
||||
kfree(id);
|
||||
return status;
|
||||
}
|
||||
|
||||
static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
|
||||
unsigned int nr_zones, size_t *buflen)
|
||||
{
|
||||
struct request_queue *q = ns->disk->queue;
|
||||
size_t bufsize;
|
||||
void *buf;
|
||||
|
||||
const size_t min_bufsize = sizeof(struct nvme_zone_report) +
|
||||
sizeof(struct nvme_zone_descriptor);
|
||||
|
||||
nr_zones = min_t(unsigned int, nr_zones,
|
||||
get_capacity(ns->disk) >> ilog2(ns->zsze));
|
||||
|
||||
bufsize = sizeof(struct nvme_zone_report) +
|
||||
nr_zones * sizeof(struct nvme_zone_descriptor);
|
||||
bufsize = min_t(size_t, bufsize,
|
||||
queue_max_hw_sectors(q) << SECTOR_SHIFT);
|
||||
bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
|
||||
|
||||
while (bufsize >= min_bufsize) {
|
||||
buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
|
||||
if (buf) {
|
||||
*buflen = bufsize;
|
||||
return buf;
|
||||
}
|
||||
bufsize >>= 1;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
|
||||
struct nvme_zone_report *report,
|
||||
size_t buflen)
|
||||
{
|
||||
struct nvme_command c = { };
|
||||
int ret;
|
||||
|
||||
c.zmr.opcode = nvme_cmd_zone_mgmt_recv;
|
||||
c.zmr.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
|
||||
c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen));
|
||||
c.zmr.zra = NVME_ZRA_ZONE_REPORT;
|
||||
c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
|
||||
c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
|
||||
|
||||
ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return le64_to_cpu(report->nr_zones);
|
||||
}
|
||||
|
||||
static int nvme_zone_parse_entry(struct nvme_ns *ns,
|
||||
struct nvme_zone_descriptor *entry,
|
||||
unsigned int idx, report_zones_cb cb,
|
||||
void *data)
|
||||
{
|
||||
struct blk_zone zone = { };
|
||||
|
||||
if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
|
||||
dev_err(ns->ctrl->device, "invalid zone type %#x\n",
|
||||
entry->zt);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
|
||||
zone.cond = entry->zs >> 4;
|
||||
zone.len = ns->zsze;
|
||||
zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
|
||||
zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
|
||||
zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
|
||||
|
||||
return cb(&zone, idx, data);
|
||||
}
|
||||
|
||||
static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
|
||||
unsigned int nr_zones, report_zones_cb cb, void *data)
|
||||
{
|
||||
struct nvme_zone_report *report;
|
||||
int ret, zone_idx = 0;
|
||||
unsigned int nz, i;
|
||||
size_t buflen;
|
||||
|
||||
report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen);
|
||||
if (!report)
|
||||
return -ENOMEM;
|
||||
|
||||
sector &= ~(ns->zsze - 1);
|
||||
while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
|
||||
memset(report, 0, buflen);
|
||||
ret = __nvme_ns_report_zones(ns, sector, report, buflen);
|
||||
if (ret < 0)
|
||||
goto out_free;
|
||||
|
||||
nz = min_t(unsigned int, ret, nr_zones);
|
||||
if (!nz)
|
||||
break;
|
||||
|
||||
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
|
||||
ret = nvme_zone_parse_entry(ns, &report->entries[i],
|
||||
zone_idx, cb, data);
|
||||
if (ret)
|
||||
goto out_free;
|
||||
zone_idx++;
|
||||
}
|
||||
|
||||
sector += ns->zsze * nz;
|
||||
}
|
||||
|
||||
if (zone_idx > 0)
|
||||
ret = zone_idx;
|
||||
else
|
||||
ret = -EINVAL;
|
||||
out_free:
|
||||
kvfree(report);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int nvme_report_zones(struct gendisk *disk, sector_t sector,
|
||||
unsigned int nr_zones, report_zones_cb cb, void *data)
|
||||
{
|
||||
struct nvme_ns_head *head = NULL;
|
||||
struct nvme_ns *ns;
|
||||
int srcu_idx, ret;
|
||||
|
||||
ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx);
|
||||
if (unlikely(!ns))
|
||||
return -EWOULDBLOCK;
|
||||
|
||||
if (ns->head->ids.csi == NVME_CSI_ZNS)
|
||||
ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
|
||||
else
|
||||
ret = -EINVAL;
|
||||
nvme_put_ns_from_disk(head, srcu_idx);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
|
||||
struct nvme_command *c, enum nvme_zone_mgmt_action action)
|
||||
{
|
||||
c->zms.opcode = nvme_cmd_zone_mgmt_send;
|
||||
c->zms.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
|
||||
c->zms.zsa = action;
|
||||
|
||||
if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
|
||||
c->zms.select_all = 1;
|
||||
|
||||
return BLK_STS_OK;
|
||||
}
|
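nvme_report_zones() above is what ultimately services blkdev_report_zones() for ZNS namespaces (it is wired up via block_device_operations elsewhere in this series): the block layer calls the driver's report_zones method, which invokes the caller's report_zones_cb once per parsed zone descriptor. A hedged sketch of an in-kernel consumer of that interface (the counting helper is invented, not part of this patch):

#include <linux/blkdev.h>

static int count_open_zones_cb(struct blk_zone *zone, unsigned int idx,
			       void *data)
{
	unsigned int *nr_open = data;

	if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	    zone->cond == BLK_ZONE_COND_EXP_OPEN)
		(*nr_open)++;
	return 0;	/* a non-zero return stops the iteration */
}

static int count_open_zones(struct block_device *bdev, unsigned int *nr_open)
{
	*nr_open = 0;
	/* returns the number of zones reported or a negative errno */
	return blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				   count_open_zones_cb, nr_open);
}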
@ -16,6 +16,18 @@ config NVME_TARGET
	  To configure the NVMe target you probably want to use the nvmetcli
	  tool from http://git.infradead.org/users/hch/nvmetcli.git.

config NVME_TARGET_PASSTHRU
	bool "NVMe Target Passthrough support"
	depends on NVME_TARGET
	depends on NVME_CORE=y || NVME_CORE=NVME_TARGET
	help
	  This enables target side NVMe passthru controller support for the
	  NVMe Over Fabrics protocol. It allows for hosts to manage and
	  directly access an actual NVMe controller residing on the target
	  side, including executing Vendor Unique Commands.

	  If unsure, say N.

config NVME_TARGET_LOOP
	tristate "NVMe loopback device support"
	depends on NVME_TARGET
|
||||
|
@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o

nvmet-y			+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
			discovery.o io-cmd-file.o io-cmd-bdev.o
nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
nvme-loop-y	+= loop.o
nvmet-rdma-y	+= rdma.o
nvmet-fc-y	+= fc.o
|
||||
|
@ -113,11 +113,10 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
|
||||
u64 data_units_read = 0, data_units_written = 0;
|
||||
struct nvmet_ns *ns;
|
||||
struct nvmet_ctrl *ctrl;
|
||||
unsigned long idx;
|
||||
|
||||
ctrl = req->sq->ctrl;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
|
||||
xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
|
||||
/* we don't have the right data for file backed ns */
|
||||
if (!ns->bdev)
|
||||
continue;
|
||||
@ -127,9 +126,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
|
||||
host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
|
||||
data_units_written += DIV_ROUND_UP(
|
||||
part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
|
||||
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
put_unaligned_le64(host_reads, &slog->host_reads[0]);
|
||||
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
|
||||
@ -230,14 +227,13 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvmet_ns *ns;
|
||||
unsigned long idx;
|
||||
u32 count = 0;
|
||||
|
||||
if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
|
||||
xa_for_each(&ctrl->subsys->namespaces, idx, ns)
|
||||
if (ns->anagrpid == grpid)
|
||||
desc->nsids[count++] = cpu_to_le32(ns->nsid);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
desc->grpid = cpu_to_le32(grpid);
|
||||
@ -427,7 +423,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
|
||||
id->awupf = 0;
|
||||
|
||||
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
|
||||
if (ctrl->ops->has_keyed_sgls)
|
||||
if (ctrl->ops->flags & NVMF_KEYED_SGLS)
|
||||
id->sgls |= cpu_to_le32(1 << 2);
|
||||
if (req->port->inline_data_size)
|
||||
id->sgls |= cpu_to_le32(1 << 20);
|
||||
@ -556,6 +552,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
|
||||
static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvmet_ns *ns;
|
||||
unsigned long idx;
|
||||
u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid);
|
||||
__le32 *list;
|
||||
u16 status = 0;
|
||||
@ -567,15 +564,13 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
|
||||
goto out;
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
|
||||
xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
|
||||
if (ns->nsid <= min_nsid)
|
||||
continue;
|
||||
list[i++] = cpu_to_le32(ns->nsid);
|
||||
if (i == buf_size / sizeof(__le32))
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
status = nvmet_copy_to_sgl(req, 0, list, buf_size);
|
||||
|
||||
@ -754,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nvmet_execute_set_features(struct nvmet_req *req)
|
||||
void nvmet_execute_set_features(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
|
||||
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
|
||||
@ -829,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req)
|
||||
nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
|
||||
}
|
||||
|
||||
static void nvmet_execute_get_features(struct nvmet_req *req)
|
||||
void nvmet_execute_get_features(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
|
||||
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
|
||||
@ -945,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
if (nvmet_req_passthru_ctrl(req))
|
||||
return nvmet_parse_passthru_admin_cmd(req);
|
||||
|
||||
switch (cmd->common.opcode) {
|
||||
case nvme_admin_get_log_page:
|
||||
req->execute = nvmet_execute_get_log_page;
|
||||
|
@ -666,6 +666,103 @@ static const struct config_item_type nvmet_namespaces_type = {
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_NVME_TARGET_PASSTHRU
|
||||
|
||||
static ssize_t nvmet_passthru_device_path_show(struct config_item *item,
|
||||
char *page)
|
||||
{
|
||||
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
|
||||
|
||||
return snprintf(page, PAGE_SIZE, "%s\n", subsys->passthru_ctrl_path);
|
||||
}
|
||||
|
||||
static ssize_t nvmet_passthru_device_path_store(struct config_item *item,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
|
||||
size_t len;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&subsys->lock);
|
||||
|
||||
ret = -EBUSY;
|
||||
if (subsys->passthru_ctrl)
|
||||
goto out_unlock;
|
||||
|
||||
ret = -EINVAL;
|
||||
len = strcspn(page, "\n");
|
||||
if (!len)
|
||||
goto out_unlock;
|
||||
|
||||
kfree(subsys->passthru_ctrl_path);
|
||||
ret = -ENOMEM;
|
||||
subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL);
|
||||
if (!subsys->passthru_ctrl_path)
|
||||
goto out_unlock;
|
||||
|
||||
mutex_unlock(&subsys->lock);
|
||||
|
||||
return count;
|
||||
out_unlock:
|
||||
mutex_unlock(&subsys->lock);
|
||||
return ret;
|
||||
}
|
||||
CONFIGFS_ATTR(nvmet_passthru_, device_path);
|
||||
|
||||
static ssize_t nvmet_passthru_enable_show(struct config_item *item,
|
||||
char *page)
|
||||
{
|
||||
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
|
||||
|
||||
return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0);
|
||||
}
|
||||
|
||||
static ssize_t nvmet_passthru_enable_store(struct config_item *item,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
|
||||
bool enable;
|
||||
int ret = 0;
|
||||
|
||||
if (strtobool(page, &enable))
|
||||
return -EINVAL;
|
||||
|
||||
if (enable)
|
||||
ret = nvmet_passthru_ctrl_enable(subsys);
|
||||
else
|
||||
nvmet_passthru_ctrl_disable(subsys);
|
||||
|
||||
return ret ? ret : count;
|
||||
}
|
||||
CONFIGFS_ATTR(nvmet_passthru_, enable);
|
||||
|
||||
static struct configfs_attribute *nvmet_passthru_attrs[] = {
|
||||
&nvmet_passthru_attr_device_path,
|
||||
&nvmet_passthru_attr_enable,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct config_item_type nvmet_passthru_type = {
|
||||
.ct_attrs = nvmet_passthru_attrs,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
|
||||
{
|
||||
config_group_init_type_name(&subsys->passthru_group,
|
||||
"passthru", &nvmet_passthru_type);
|
||||
configfs_add_default_group(&subsys->passthru_group,
|
||||
&subsys->group);
|
||||
}
|
||||
|
||||
#else /* CONFIG_NVME_TARGET_PASSTHRU */
|
||||
|
||||
static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
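For readers less familiar with the configfs plumbing used above: CONFIGFS_ATTR(prefix_, name) emits a struct configfs_attribute called prefix_attr_name and wires it to prefix_name_show()/prefix_name_store(), which is why the attribute list can reference &nvmet_passthru_attr_device_path and &nvmet_passthru_attr_enable directly. A minimal hypothetical attribute (invented names, not part of this patch) showing that contract:

#include <linux/configfs.h>

static ssize_t foo_example_show(struct config_item *item, char *page)
{
	return sprintf(page, "%d\n", 42);
}

static ssize_t foo_example_store(struct config_item *item, const char *page,
				 size_t count)
{
	return count;	/* accept and discard the written value */
}
CONFIGFS_ATTR(foo_, example);		/* creates foo_attr_example */

static struct configfs_attribute *foo_attrs[] = {
	&foo_attr_example,
	NULL,
};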
|
||||
|
||||
static int nvmet_port_subsys_allow_link(struct config_item *parent,
|
||||
struct config_item *target)
|
||||
{
|
||||
@ -862,14 +959,14 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item,
|
||||
struct nvmet_subsys *subsys = to_subsys(item);
|
||||
|
||||
if (NVME_TERTIARY(subsys->ver))
|
||||
return snprintf(page, PAGE_SIZE, "%d.%d.%d\n",
|
||||
(int)NVME_MAJOR(subsys->ver),
|
||||
(int)NVME_MINOR(subsys->ver),
|
||||
(int)NVME_TERTIARY(subsys->ver));
|
||||
return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n",
|
||||
NVME_MAJOR(subsys->ver),
|
||||
NVME_MINOR(subsys->ver),
|
||||
NVME_TERTIARY(subsys->ver));
|
||||
|
||||
return snprintf(page, PAGE_SIZE, "%d.%d\n",
|
||||
(int)NVME_MAJOR(subsys->ver),
|
||||
(int)NVME_MINOR(subsys->ver));
|
||||
return snprintf(page, PAGE_SIZE, "%llu.%llu\n",
|
||||
NVME_MAJOR(subsys->ver),
|
||||
NVME_MINOR(subsys->ver));
|
||||
}
|
||||
|
||||
static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
|
||||
@ -879,6 +976,10 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
|
||||
int major, minor, tertiary = 0;
|
||||
int ret;
|
||||
|
||||
/* passthru subsystems use the underlying controller's version */
|
||||
if (nvmet_passthru_ctrl(subsys))
|
||||
return -EINVAL;
|
||||
|
||||
ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary);
|
||||
if (ret != 2 && ret != 3)
|
||||
return -EINVAL;
|
||||
@ -1121,6 +1222,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group,
|
||||
configfs_add_default_group(&subsys->allowed_hosts_group,
|
||||
&subsys->group);
|
||||
|
||||
nvmet_add_passthru_group(subsys);
|
||||
|
||||
return &subsys->group;
|
||||
}
|
||||
|
||||
|
@ -115,13 +115,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
|
||||
|
||||
static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
|
||||
{
|
||||
struct nvmet_ns *ns;
|
||||
unsigned long nsid = 0;
|
||||
struct nvmet_ns *cur;
|
||||
unsigned long idx;
|
||||
|
||||
if (list_empty(&subsys->namespaces))
|
||||
return 0;
|
||||
xa_for_each(&subsys->namespaces, idx, cur)
|
||||
nsid = cur->nsid;
|
||||
|
||||
ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
|
||||
return ns->nsid;
|
||||
return nsid;
|
||||
}
|
||||
|
||||
static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
|
||||
@ -336,7 +337,7 @@ int nvmet_enable_port(struct nvmet_port *port)
|
||||
* If the user requested PI support and the transport isn't pi capable,
|
||||
* don't enable the port.
|
||||
*/
|
||||
if (port->pi_enable && !ops->metadata_support) {
|
||||
if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) {
|
||||
pr_err("T10-PI is not supported by transport type %d\n",
|
||||
port->disc_addr.trtype);
|
||||
ret = -EINVAL;
|
||||
@ -410,28 +411,13 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
|
||||
cancel_delayed_work_sync(&ctrl->ka_work);
|
||||
}
|
||||
|
||||
static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
|
||||
__le32 nsid)
|
||||
{
|
||||
struct nvmet_ns *ns;
|
||||
|
||||
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
|
||||
if (ns->nsid == le32_to_cpu(nsid))
|
||||
return ns;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
|
||||
{
|
||||
struct nvmet_ns *ns;
|
||||
|
||||
rcu_read_lock();
|
||||
ns = __nvmet_find_namespace(ctrl, nsid);
|
||||
ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid));
|
||||
if (ns)
|
||||
percpu_ref_get(&ns->ref);
|
||||
rcu_read_unlock();
|
||||
|
||||
return ns;
|
||||
}
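The namespace bookkeeping in nvmet is converted here from an RCU-protected, sorted list to an xarray indexed by nsid, which turns nvmet_find_namespace() into a plain xa_load(). A hedged, generic sketch of the xarray calls involved (hypothetical "item" objects, not nvmet code):

#include <linux/xarray.h>

struct item {
	u32 id;
};

static DEFINE_XARRAY(items);

static int item_add(struct item *it)
{
	/* fails with -EBUSY if the index is already occupied */
	return xa_insert(&items, it->id, it, GFP_KERNEL);
}

static struct item *item_lookup(u32 id)
{
	return xa_load(&items, id);	/* lock-free lookup, may return NULL */
}

static void item_del(struct item *it)
{
	xa_erase(&items, it->id);
}

static u32 item_max_id(void)
{
	struct item *it;
	unsigned long idx;
	u32 max = 0;

	xa_for_each(&items, idx, it)	/* visits entries in index order */
		max = it->id;
	return max;
}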
|
||||
@ -558,6 +544,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
|
||||
|
||||
mutex_lock(&subsys->lock);
|
||||
ret = 0;
|
||||
|
||||
if (nvmet_passthru_ctrl(subsys)) {
|
||||
pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
if (ns->enabled)
|
||||
goto out_unlock;
|
||||
|
||||
@ -586,24 +578,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
|
||||
if (ns->nsid > subsys->max_nsid)
|
||||
subsys->max_nsid = ns->nsid;
|
||||
|
||||
/*
|
||||
* The namespaces list needs to be sorted to simplify the implementation
|
||||
* of the Identify Namepace List subcommand.
|
||||
*/
|
||||
if (list_empty(&subsys->namespaces)) {
|
||||
list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
|
||||
} else {
|
||||
struct nvmet_ns *old;
|
||||
ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
|
||||
if (ret)
|
||||
goto out_restore_subsys_maxnsid;
|
||||
|
||||
list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
|
||||
lockdep_is_held(&subsys->lock)) {
|
||||
BUG_ON(ns->nsid == old->nsid);
|
||||
if (ns->nsid < old->nsid)
|
||||
break;
|
||||
}
|
||||
|
||||
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
|
||||
}
|
||||
subsys->nr_namespaces++;
|
||||
|
||||
nvmet_ns_changed(subsys, ns->nsid);
|
||||
@ -612,6 +590,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
|
||||
out_unlock:
|
||||
mutex_unlock(&subsys->lock);
|
||||
return ret;
|
||||
|
||||
out_restore_subsys_maxnsid:
|
||||
subsys->max_nsid = nvmet_max_nsid(subsys);
|
||||
percpu_ref_exit(&ns->ref);
|
||||
out_dev_put:
|
||||
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
|
||||
pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
|
||||
@ -630,7 +612,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
|
||||
goto out_unlock;
|
||||
|
||||
ns->enabled = false;
|
||||
list_del_rcu(&ns->dev_link);
|
||||
xa_erase(&ns->subsys->namespaces, ns->nsid);
|
||||
if (ns->nsid == subsys->max_nsid)
|
||||
subsys->max_nsid = nvmet_max_nsid(subsys);
|
||||
|
||||
@ -681,7 +663,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
|
||||
if (!ns)
|
||||
return NULL;
|
||||
|
||||
INIT_LIST_HEAD(&ns->dev_link);
|
||||
init_completion(&ns->disable_done);
|
||||
|
||||
ns->nsid = nsid;
|
||||
@ -874,6 +855,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
if (nvmet_req_passthru_ctrl(req))
|
||||
return nvmet_parse_passthru_io_cmd(req);
|
||||
|
||||
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
|
||||
if (unlikely(!req->ns)) {
|
||||
req->error_loc = offsetof(struct nvme_common_command, nsid);
|
||||
@ -1263,14 +1247,14 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
|
||||
struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ns *ns;
|
||||
unsigned long idx;
|
||||
|
||||
if (!req->p2p_client)
|
||||
return;
|
||||
|
||||
ctrl->p2p_client = get_device(req->p2p_client);
|
||||
|
||||
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
|
||||
lockdep_is_held(&ctrl->subsys->lock))
|
||||
xa_for_each(&ctrl->subsys->namespaces, idx, ns)
|
||||
nvmet_p2pmem_ns_add_p2p(ctrl, ns);
|
||||
}
|
||||
|
||||
@ -1495,7 +1479,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
|
||||
if (!subsys)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
|
||||
subsys->ver = NVMET_DEFAULT_VS;
|
||||
/* generate a random serial number as our controllers are ephemeral: */
|
||||
get_random_bytes(&subsys->serial, sizeof(subsys->serial));
|
||||
|
||||
@ -1523,7 +1507,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
|
||||
kref_init(&subsys->ref);
|
||||
|
||||
mutex_init(&subsys->lock);
|
||||
INIT_LIST_HEAD(&subsys->namespaces);
|
||||
xa_init(&subsys->namespaces);
|
||||
INIT_LIST_HEAD(&subsys->ctrls);
|
||||
INIT_LIST_HEAD(&subsys->hosts);
|
||||
|
||||
@ -1535,7 +1519,10 @@ static void nvmet_subsys_free(struct kref *ref)
|
||||
struct nvmet_subsys *subsys =
|
||||
container_of(ref, struct nvmet_subsys, ref);
|
||||
|
||||
WARN_ON_ONCE(!list_empty(&subsys->namespaces));
|
||||
WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
|
||||
|
||||
xa_destroy(&subsys->namespaces);
|
||||
nvmet_passthru_subsys_free(subsys);
|
||||
|
||||
kfree(subsys->subsysnqn);
|
||||
kfree_rcu(subsys->model, rcuhead);
|
||||
|
@ -277,7 +277,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req)
|
||||
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
|
||||
|
||||
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
|
||||
if (ctrl->ops->has_keyed_sgls)
|
||||
if (ctrl->ops->flags & NVMF_KEYED_SGLS)
|
||||
id->sgls |= cpu_to_le32(1 << 2);
|
||||
if (req->port->inline_data_size)
|
||||
id->sgls |= cpu_to_le32(1 << 20);
|
||||
|
@ -167,7 +167,6 @@ struct nvmet_fc_tgt_assoc {
|
||||
struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1];
|
||||
struct kref ref;
|
||||
struct work_struct del_work;
|
||||
atomic_t del_work_active;
|
||||
};
|
||||
|
||||
|
||||
@ -1090,7 +1089,6 @@ nvmet_fc_delete_assoc(struct work_struct *work)
|
||||
container_of(work, struct nvmet_fc_tgt_assoc, del_work);
|
||||
|
||||
nvmet_fc_delete_target_assoc(assoc);
|
||||
atomic_set(&assoc->del_work_active, 0);
|
||||
nvmet_fc_tgt_a_put(assoc);
|
||||
}
|
||||
|
||||
@ -1123,7 +1121,6 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle)
|
||||
INIT_LIST_HEAD(&assoc->a_list);
|
||||
kref_init(&assoc->ref);
|
||||
INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc);
|
||||
atomic_set(&assoc->del_work_active, 0);
|
||||
atomic_set(&assoc->terminating, 0);
|
||||
|
||||
while (needrandom) {
|
||||
@ -1243,7 +1240,8 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport,
|
||||
list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
|
||||
if (association_id == assoc->association_id) {
|
||||
ret = assoc;
|
||||
nvmet_fc_tgt_a_get(assoc);
|
||||
if (!nvmet_fc_tgt_a_get(assoc))
|
||||
ret = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1477,21 +1475,15 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport)
|
||||
{
|
||||
struct nvmet_fc_tgt_assoc *assoc, *next;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
spin_lock_irqsave(&tgtport->lock, flags);
|
||||
list_for_each_entry_safe(assoc, next,
|
||||
&tgtport->assoc_list, a_list) {
|
||||
if (!nvmet_fc_tgt_a_get(assoc))
|
||||
continue;
|
||||
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
|
||||
if (ret == 0) {
|
||||
if (!schedule_work(&assoc->del_work))
|
||||
nvmet_fc_tgt_a_put(assoc);
|
||||
} else {
|
||||
if (!schedule_work(&assoc->del_work))
|
||||
/* already deleting - release local reference */
|
||||
nvmet_fc_tgt_a_put(assoc);
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&tgtport->lock, flags);
|
||||
}
|
||||
@ -1533,7 +1525,6 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
|
||||
struct nvmet_fc_tgt_assoc *assoc, *next;
|
||||
unsigned long flags;
|
||||
bool noassoc = true;
|
||||
int ret;
|
||||
|
||||
spin_lock_irqsave(&tgtport->lock, flags);
|
||||
list_for_each_entry_safe(assoc, next,
|
||||
@ -1545,14 +1536,9 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
|
||||
continue;
|
||||
assoc->hostport->invalid = 1;
|
||||
noassoc = false;
|
||||
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
|
||||
if (ret == 0) {
|
||||
if (!schedule_work(&assoc->del_work))
|
||||
nvmet_fc_tgt_a_put(assoc);
|
||||
} else {
|
||||
if (!schedule_work(&assoc->del_work))
|
||||
/* already deleting - release local reference */
|
||||
nvmet_fc_tgt_a_put(assoc);
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&tgtport->lock, flags);
|
||||
|
||||
@ -1573,7 +1559,6 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
|
||||
struct nvmet_fc_tgt_queue *queue;
|
||||
unsigned long flags;
|
||||
bool found_ctrl = false;
|
||||
int ret;
|
||||
|
||||
/* this is a bit ugly, but don't want to make locks layered */
|
||||
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
|
||||
@ -1597,14 +1582,9 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
|
||||
nvmet_fc_tgtport_put(tgtport);
|
||||
|
||||
if (found_ctrl) {
|
||||
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
|
||||
if (ret == 0) {
|
||||
if (!schedule_work(&assoc->del_work))
|
||||
nvmet_fc_tgt_a_put(assoc);
|
||||
} else {
|
||||
if (!schedule_work(&assoc->del_work))
|
||||
/* already deleting - release local reference */
|
||||
nvmet_fc_tgt_a_put(assoc);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -43,6 +43,17 @@ static const match_table_t opt_tokens = {
|
||||
{ NVMF_OPT_ERR, NULL }
|
||||
};
|
||||
|
||||
static int fcloop_verify_addr(substring_t *s)
|
||||
{
|
||||
size_t blen = s->to - s->from + 1;
|
||||
|
||||
if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 ||
|
||||
strncmp(s->from, "0x", 2))
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
fcloop_parse_options(struct fcloop_ctrl_options *opts,
|
||||
const char *buf)
|
||||
@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
|
||||
opts->mask |= token;
|
||||
switch (token) {
|
||||
case NVMF_OPT_WWNN:
|
||||
if (match_u64(args, &token64)) {
|
||||
if (fcloop_verify_addr(args) ||
|
||||
match_u64(args, &token64)) {
|
||||
ret = -EINVAL;
|
||||
goto out_free_options;
|
||||
}
|
||||
opts->wwnn = token64;
|
||||
break;
|
||||
case NVMF_OPT_WWPN:
|
||||
if (match_u64(args, &token64)) {
|
||||
if (fcloop_verify_addr(args) ||
|
||||
match_u64(args, &token64)) {
|
||||
ret = -EINVAL;
|
||||
goto out_free_options;
|
||||
}
|
||||
@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
|
||||
opts->fcaddr = token;
|
||||
break;
|
||||
case NVMF_OPT_LPWWNN:
|
||||
if (match_u64(args, &token64)) {
|
||||
if (fcloop_verify_addr(args) ||
|
||||
match_u64(args, &token64)) {
|
||||
ret = -EINVAL;
|
||||
goto out_free_options;
|
||||
}
|
||||
opts->lpwwnn = token64;
|
||||
break;
|
||||
case NVMF_OPT_LPWWPN:
|
||||
if (match_u64(args, &token64)) {
|
||||
if (fcloop_verify_addr(args) ||
|
||||
match_u64(args, &token64)) {
|
||||
ret = -EINVAL;
|
||||
goto out_free_options;
|
||||
}
|
||||
@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname,
|
||||
token = match_token(p, opt_tokens, args);
|
||||
switch (token) {
|
||||
case NVMF_OPT_WWNN:
|
||||
if (match_u64(args, &token64)) {
|
||||
if (fcloop_verify_addr(args) ||
|
||||
match_u64(args, &token64)) {
|
||||
ret = -EINVAL;
|
||||
goto out_free_options;
|
||||
}
|
||||
*nname = token64;
|
||||
break;
|
||||
case NVMF_OPT_WWPN:
|
||||
if (match_u64(args, &token64)) {
|
||||
if (fcloop_verify_addr(args) ||
|
||||
match_u64(args, &token64)) {
|
||||
ret = -EINVAL;
|
||||
goto out_free_options;
|
||||
}
|
||||
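The new fcloop_verify_addr() helper above rejects WWNN/WWPN tokens that are not exactly "0x" followed by NVME_FC_TRADDR_HEXNAMELEN hex characters before match_u64() parses them. A small userspace sketch of the same check; the 16-character length is an assumption mirroring NVME_FC_TRADDR_HEXNAMELEN:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define TRADDR_HEXNAMELEN 16	/* assumption: mirrors NVME_FC_TRADDR_HEXNAMELEN */

/* returns true when the token is acceptable; the kernel helper returns
 * 0 on success and -EINVAL otherwise */
static bool verify_addr(const char *tok)
{
	return strlen(tok) == TRADDR_HEXNAMELEN + 2 &&
	       strncmp(tok, "0x", 2) == 0;
}

int main(void)
{
	printf("%d\n", verify_addr("0x20000090fac7893e"));	/* 1: "0x" + 16 hex chars */
	printf("%d\n", verify_addr("20000090fac7893e"));	/* 0: missing "0x" prefix */
	printf("%d\n", verify_addr("0x2000"));			/* 0: too short */
	return 0;
}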
|
@ -36,7 +36,6 @@ struct nvme_loop_ctrl {
|
||||
struct nvme_loop_iod async_event_iod;
|
||||
struct nvme_ctrl ctrl;
|
||||
|
||||
struct nvmet_ctrl *target_ctrl;
|
||||
struct nvmet_port *port;
|
||||
};
|
||||
|
||||
@ -445,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
|
||||
{
|
||||
struct nvme_loop_ctrl *ctrl =
|
||||
container_of(work, struct nvme_loop_ctrl, ctrl.reset_work);
|
||||
bool changed;
|
||||
int ret;
|
||||
|
||||
nvme_stop_ctrl(&ctrl->ctrl);
|
||||
@ -472,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
|
||||
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
|
||||
ctrl->ctrl.queue_count - 1);
|
||||
|
||||
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
|
||||
WARN_ON_ONCE(!changed);
|
||||
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
|
||||
WARN_ON_ONCE(1);
|
||||
|
||||
nvme_start_ctrl(&ctrl->ctrl);
|
||||
|
||||
@ -568,7 +566,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
|
||||
struct nvmf_ctrl_options *opts)
|
||||
{
|
||||
struct nvme_loop_ctrl *ctrl;
|
||||
bool changed;
|
||||
int ret;
|
||||
|
||||
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
|
||||
@ -584,6 +581,9 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
|
||||
if (ret)
|
||||
goto out_put_ctrl;
|
||||
|
||||
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
|
||||
WARN_ON_ONCE(1);
|
||||
|
||||
ret = -ENOMEM;
|
||||
|
||||
ctrl->ctrl.sqsize = opts->queue_size - 1;
|
||||
@ -618,8 +618,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
|
||||
dev_info(ctrl->ctrl.device,
|
||||
"new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
|
||||
|
||||
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
|
||||
WARN_ON_ONCE(!changed);
|
||||
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
|
||||
WARN_ON_ONCE(1);
|
||||
|
||||
mutex_lock(&nvme_loop_ctrl_mutex);
|
||||
list_add_tail(&ctrl->list, &nvme_loop_ctrl_list);
|
||||
|
@ -21,6 +21,8 @@
|
||||
#include <linux/radix-tree.h>
|
||||
#include <linux/t10-pi.h>
|
||||
|
||||
#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0)
|
||||
|
||||
#define NVMET_ASYNC_EVENTS 4
|
||||
#define NVMET_ERROR_LOG_SLOTS 128
|
||||
#define NVMET_NO_ERROR_LOC ((u16)-1)
|
||||
@ -52,7 +54,6 @@
|
||||
(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
|
||||
|
||||
struct nvmet_ns {
|
||||
struct list_head dev_link;
|
||||
struct percpu_ref ref;
|
||||
struct block_device *bdev;
|
||||
struct file *file;
|
||||
@ -219,7 +220,7 @@ struct nvmet_subsys {
|
||||
struct mutex lock;
|
||||
struct kref ref;
|
||||
|
||||
struct list_head namespaces;
|
||||
struct xarray namespaces;
|
||||
unsigned int nr_namespaces;
|
||||
unsigned int max_nsid;
|
||||
u16 cntlid_min;
|
||||
@ -243,6 +244,12 @@ struct nvmet_subsys {
|
||||
struct config_group allowed_hosts_group;
|
||||
|
||||
struct nvmet_subsys_model __rcu *model;
|
||||
|
||||
#ifdef CONFIG_NVME_TARGET_PASSTHRU
|
||||
struct nvme_ctrl *passthru_ctrl;
|
||||
char *passthru_ctrl_path;
|
||||
struct config_group passthru_group;
|
||||
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
|
||||
};
|
||||
|
||||
static inline struct nvmet_subsys *to_subsys(struct config_item *item)
|
||||
@ -286,8 +293,9 @@ struct nvmet_fabrics_ops {
|
||||
struct module *owner;
|
||||
unsigned int type;
|
||||
unsigned int msdbd;
|
||||
bool has_keyed_sgls : 1;
|
||||
bool metadata_support : 1;
|
||||
unsigned int flags;
|
||||
#define NVMF_KEYED_SGLS (1 << 0)
|
||||
#define NVMF_METADATA_SUPPORTED (1 << 1)
|
||||
void (*queue_response)(struct nvmet_req *req);
|
||||
int (*add_port)(struct nvmet_port *port);
|
||||
void (*remove_port)(struct nvmet_port *port);
|
||||
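The nvmet_fabrics_ops hunk above folds the two capability bitfields into a single flags word. A small userspace sketch of how a transport advertises the bits and how the core tests them; the struct and variable names are stand-ins, not the real ops tables:

#include <stdio.h>

#define NVMF_KEYED_SGLS         (1 << 0)
#define NVMF_METADATA_SUPPORTED (1 << 1)

struct example_ops {
	unsigned int flags;
};

int main(void)
{
	/* what nvmet_rdma_ops now advertises */
	struct example_ops rdma = { .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED };
	/* nvmet_tcp_ops sets neither capability */
	struct example_ops tcp = { .flags = 0 };

	/* the tests done in nvmet_enable_port() and the discovery identify path */
	printf("rdma keyed SGLs: %d\n", !!(rdma.flags & NVMF_KEYED_SGLS));
	printf("tcp metadata:    %d\n", !!(tcp.flags & NVMF_METADATA_SUPPORTED));
	return 0;
}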
@ -321,6 +329,11 @@ struct nvmet_req {
|
||||
struct bio_vec *bvec;
|
||||
struct work_struct work;
|
||||
} f;
|
||||
struct {
|
||||
struct request *rq;
|
||||
struct work_struct work;
|
||||
bool use_workqueue;
|
||||
} p;
|
||||
};
|
||||
int sg_cnt;
|
||||
int metadata_sg_cnt;
|
||||
@ -400,6 +413,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
|
||||
int nvmet_req_alloc_sgls(struct nvmet_req *req);
|
||||
void nvmet_req_free_sgls(struct nvmet_req *req);
|
||||
|
||||
void nvmet_execute_set_features(struct nvmet_req *req);
|
||||
void nvmet_execute_get_features(struct nvmet_req *req);
|
||||
void nvmet_execute_keep_alive(struct nvmet_req *req);
|
||||
|
||||
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
|
||||
@ -532,6 +547,43 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req)
|
||||
sizeof(struct nvme_dsm_range);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NVME_TARGET_PASSTHRU
|
||||
void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
|
||||
int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
|
||||
void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys);
|
||||
u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req);
|
||||
u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req);
|
||||
static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
|
||||
{
|
||||
return subsys->passthru_ctrl;
|
||||
}
|
||||
#else /* CONFIG_NVME_TARGET_PASSTHRU */
|
||||
static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
|
||||
{
|
||||
}
|
||||
static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
|
||||
{
|
||||
}
|
||||
static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
|
||||
|
||||
static inline struct nvme_ctrl *
|
||||
nvmet_req_passthru_ctrl(struct nvmet_req *req)
|
||||
{
|
||||
return nvmet_passthru_ctrl(req->sq->ctrl->subsys);
|
||||
}
|
||||
|
||||
u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
|
||||
|
||||
/* Convert a 32-bit number to a 16-bit 0's based number */
|
||||
|
drivers/nvme/target/passthru.c | 544 lines (new file)
@ -0,0 +1,544 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* NVMe Over Fabrics Target Passthrough command implementation.
|
||||
*
|
||||
* Copyright (c) 2017-2018 Western Digital Corporation or its
|
||||
* affiliates.
|
||||
* Copyright (c) 2019-2020, Eideticom Inc.
|
||||
*
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
|
||||
#include "../host/nvme.h"
|
||||
#include "nvmet.h"
|
||||
|
||||
MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU);
|
||||
|
||||
/*
|
||||
* xarray to maintain one passthru subsystem per nvme controller.
|
||||
*/
|
||||
static DEFINE_XARRAY(passthru_subsystems);
|
||||
|
||||
static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
|
||||
{
|
||||
struct nvmet_ctrl *ctrl = req->sq->ctrl;
|
||||
struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl;
|
||||
u16 status = NVME_SC_SUCCESS;
|
||||
struct nvme_id_ctrl *id;
|
||||
u32 max_hw_sectors;
|
||||
int page_shift;
|
||||
|
||||
id = kzalloc(sizeof(*id), GFP_KERNEL);
|
||||
if (!id)
|
||||
return NVME_SC_INTERNAL;
|
||||
|
||||
status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id));
|
||||
if (status)
|
||||
goto out_free;
|
||||
|
||||
id->cntlid = cpu_to_le16(ctrl->cntlid);
|
||||
id->ver = cpu_to_le32(ctrl->subsys->ver);
|
||||
|
||||
/*
|
||||
* The passthru NVMe driver may have a limit on the number of segments
|
||||
* which depends on the host's memory fragmentation. To solve this,
|
||||
* ensure mdts is limited to the pages equal to the number of segments.
|
||||
*/
|
||||
max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9),
|
||||
pctrl->max_hw_sectors);
|
||||
|
||||
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
|
||||
|
||||
id->mdts = ilog2(max_hw_sectors) + 9 - page_shift;
|
||||
|
||||
id->acl = 3;
|
||||
/*
|
||||
* We export aerl limit for the fabrics controller, update this when
|
||||
* passthru based aerl support is added.
|
||||
*/
|
||||
id->aerl = NVMET_ASYNC_EVENTS - 1;
|
||||
|
||||
/* emulate kas as most PCIe ctrls don't support kas */
|
||||
id->kas = cpu_to_le16(NVMET_KAS);
|
||||
|
||||
/* don't support host memory buffer */
|
||||
id->hmpre = 0;
|
||||
id->hmmin = 0;
|
||||
|
||||
id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes);
|
||||
id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes);
|
||||
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
|
||||
|
||||
/* don't support fuse commands */
|
||||
id->fuses = 0;
|
||||
|
||||
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
|
||||
if (ctrl->ops->flags & NVMF_KEYED_SGLS)
|
||||
id->sgls |= cpu_to_le32(1 << 2);
|
||||
if (req->port->inline_data_size)
|
||||
id->sgls |= cpu_to_le32(1 << 20);
|
||||
|
||||
/*
|
||||
* When the passthru controller is set up using the nvme-loop transport it will
|
||||
* export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in
|
||||
* nvme/host/core.c, in the nvme_init_subsystem()->nvme_active_ctrl()
|
||||
* code path with a duplicate ctrl subsysnqn. In order to prevent that we
|
||||
* mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn.
|
||||
*/
|
||||
memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn));
|
||||
|
||||
/* use fabric id-ctrl values */
|
||||
id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) +
|
||||
req->port->inline_data_size) / 16);
|
||||
id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16);
|
||||
|
||||
id->msdbd = ctrl->ops->msdbd;
|
||||
|
||||
/* Support multipath connections with fabrics */
|
||||
id->cmic |= 1 << 1;
|
||||
|
||||
/* Disable reservations, see nvmet_parse_passthru_io_cmd() */
|
||||
id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS);
|
||||
|
||||
status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl));
|
||||
|
||||
out_free:
|
||||
kfree(id);
|
||||
return status;
|
||||
}
|
||||
|
||||
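For a concrete feel of the mdts clamp above, a worked example with hypothetical values (128 segments, 4 KiB host pages, CAP.MPSMIN = 0); the arithmetic matches the id->mdts line in nvmet_passthru_override_id_ctrl():

#include <stdio.h>

static unsigned int ilog2_u32(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned int max_segments = 128;	/* hypothetical passthru ctrl limit */
	unsigned int host_page_shift = 12;	/* PAGE_SHIFT on most configs */
	unsigned int mps_shift = 12 + 0;	/* 12 + CAP.MPSMIN */
	unsigned int max_hw_sectors = max_segments << (host_page_shift - 9);
	unsigned int mdts = ilog2_u32(max_hw_sectors) + 9 - mps_shift;

	/* 128 segments of one page each -> 1024 sectors -> MDTS of 7,
	 * i.e. 2^7 * 4 KiB = 512 KiB per command */
	printf("max_hw_sectors=%u mdts=%u (%u KiB)\n",
	       max_hw_sectors, mdts, (1u << mdts) * 4);
	return 0;
}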
static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req)
|
||||
{
|
||||
u16 status = NVME_SC_SUCCESS;
|
||||
struct nvme_id_ns *id;
|
||||
int i;
|
||||
|
||||
id = kzalloc(sizeof(*id), GFP_KERNEL);
|
||||
if (!id)
|
||||
return NVME_SC_INTERNAL;
|
||||
|
||||
status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns));
|
||||
if (status)
|
||||
goto out_free;
|
||||
|
||||
for (i = 0; i < (id->nlbaf + 1); i++)
|
||||
if (id->lbaf[i].ms)
|
||||
memset(&id->lbaf[i], 0, sizeof(id->lbaf[i]));
|
||||
|
||||
id->flbas = id->flbas & ~(1 << 4);
|
||||
|
||||
/*
|
||||
* Presently the NVMEof target code does not support sending
|
||||
* metadata, so we must disable it here. This should be updated
|
||||
* once target starts supporting metadata.
|
||||
*/
|
||||
id->mc = 0;
|
||||
|
||||
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
|
||||
|
||||
out_free:
|
||||
kfree(id);
|
||||
return status;
|
||||
}
|
||||
|
||||
static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
|
||||
{
|
||||
struct nvmet_req *req = container_of(w, struct nvmet_req, p.work);
|
||||
struct request *rq = req->p.rq;
|
||||
u16 status;
|
||||
|
||||
nvme_execute_passthru_rq(rq);
|
||||
|
||||
status = nvme_req(rq)->status;
|
||||
if (status == NVME_SC_SUCCESS &&
|
||||
req->cmd->common.opcode == nvme_admin_identify) {
|
||||
switch (req->cmd->identify.cns) {
|
||||
case NVME_ID_CNS_CTRL:
|
||||
nvmet_passthru_override_id_ctrl(req);
|
||||
break;
|
||||
case NVME_ID_CNS_NS:
|
||||
nvmet_passthru_override_id_ns(req);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
req->cqe->result = nvme_req(rq)->result;
|
||||
nvmet_req_complete(req, status);
|
||||
blk_put_request(rq);
|
||||
}
|
||||
|
||||
static void nvmet_passthru_req_done(struct request *rq,
|
||||
blk_status_t blk_status)
|
||||
{
|
||||
struct nvmet_req *req = rq->end_io_data;
|
||||
|
||||
req->cqe->result = nvme_req(rq)->result;
|
||||
nvmet_req_complete(req, nvme_req(rq)->status);
|
||||
blk_put_request(rq);
|
||||
}
|
||||
|
||||
static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
|
||||
{
|
||||
int sg_cnt = req->sg_cnt;
|
||||
struct scatterlist *sg;
|
||||
int op_flags = 0;
|
||||
struct bio *bio;
|
||||
int i, ret;
|
||||
|
||||
if (req->cmd->common.opcode == nvme_cmd_flush)
|
||||
op_flags = REQ_FUA;
|
||||
else if (nvme_is_write(req->cmd))
|
||||
op_flags = REQ_SYNC | REQ_IDLE;
|
||||
|
||||
bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
|
||||
bio->bi_end_io = bio_put;
|
||||
bio->bi_opf = req_op(rq) | op_flags;
|
||||
|
||||
for_each_sg(req->sg, sg, req->sg_cnt, i) {
|
||||
if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length,
|
||||
sg->offset) < sg->length) {
|
||||
bio_put(bio);
|
||||
return -EINVAL;
|
||||
}
|
||||
sg_cnt--;
|
||||
}
|
||||
|
||||
ret = blk_rq_append_bio(rq, &bio);
|
||||
if (unlikely(ret)) {
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
|
||||
struct request_queue *q = ctrl->admin_q;
|
||||
struct nvme_ns *ns = NULL;
|
||||
struct request *rq = NULL;
|
||||
u32 effects;
|
||||
u16 status;
|
||||
int ret;
|
||||
|
||||
if (likely(req->sq->qid != 0)) {
|
||||
u32 nsid = le32_to_cpu(req->cmd->common.nsid);
|
||||
|
||||
ns = nvme_find_get_ns(ctrl, nsid);
|
||||
if (unlikely(!ns)) {
|
||||
pr_err("failed to get passthru ns nsid:%u\n", nsid);
|
||||
status = NVME_SC_INVALID_NS | NVME_SC_DNR;
|
||||
goto fail_out;
|
||||
}
|
||||
|
||||
q = ns->queue;
|
||||
}
|
||||
|
||||
rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
|
||||
if (IS_ERR(rq)) {
|
||||
rq = NULL;
|
||||
status = NVME_SC_INTERNAL;
|
||||
goto fail_out;
|
||||
}
|
||||
|
||||
if (req->sg_cnt) {
|
||||
ret = nvmet_passthru_map_sg(req, rq);
|
||||
if (unlikely(ret)) {
|
||||
status = NVME_SC_INTERNAL;
|
||||
goto fail_out;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If there are effects for the command we are about to execute, or
* an end_req function, we need to run nvme_execute_passthru_rq()
* synchronously in a work item, since the end_req function and
* nvme_passthru_end() can't be called from the request done callback,
* which is typically in interrupt context.
|
||||
*/
|
||||
effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode);
|
||||
if (req->p.use_workqueue || effects) {
|
||||
INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work);
|
||||
req->p.rq = rq;
|
||||
schedule_work(&req->p.work);
|
||||
} else {
|
||||
rq->end_io_data = req;
|
||||
blk_execute_rq_nowait(rq->q, ns ? ns->disk : NULL, rq, 0,
|
||||
nvmet_passthru_req_done);
|
||||
}
|
||||
|
||||
if (ns)
|
||||
nvme_put_ns(ns);
|
||||
|
||||
return;
|
||||
|
||||
fail_out:
|
||||
if (ns)
|
||||
nvme_put_ns(ns);
|
||||
nvmet_req_complete(req, status);
|
||||
blk_put_request(rq);
|
||||
}
|
||||
|
||||
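The tail of nvmet_passthru_execute_cmd() above chooses between two completion paths. A compact userspace sketch of that decision; the two helpers are placeholders for scheduling the work item versus calling blk_execute_rq_nowait():

#include <stdbool.h>
#include <stdio.h>

static void run_in_work_item(void)
{
	printf("execute synchronously from a work item\n");
}

static void run_async(void)
{
	printf("execute asynchronously, complete from the done callback\n");
}

static void dispatch(bool use_workqueue, unsigned int effects)
{
	/* same condition as the patch: commands with side effects, or those
	 * explicitly flagged (the overridden Identify variants), must run
	 * from process context */
	if (use_workqueue || effects)
		run_in_work_item();
	else
		run_async();
}

int main(void)
{
	dispatch(true, 0);	/* Identify CNS 0x00/0x01 */
	dispatch(false, 1);	/* command with effects */
	dispatch(false, 0);	/* plain passthru I/O */
	return 0;
}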
/*
|
||||
* We need to emulate set host behaviour to ensure that any requested
|
||||
* behaviour of the target's host matches the requested behaviour
|
||||
* of the device's host and fail otherwise.
|
||||
*/
|
||||
static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req);
|
||||
struct nvme_feat_host_behavior *host;
|
||||
u16 status = NVME_SC_INTERNAL;
|
||||
int ret;
|
||||
|
||||
host = kzalloc(sizeof(*host) * 2, GFP_KERNEL);
|
||||
if (!host)
|
||||
goto out_complete_req;
|
||||
|
||||
ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
|
||||
host, sizeof(*host), NULL);
|
||||
if (ret)
|
||||
goto out_free_host;
|
||||
|
||||
status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host));
|
||||
if (status)
|
||||
goto out_free_host;
|
||||
|
||||
if (memcmp(&host[0], &host[1], sizeof(host[0]))) {
|
||||
pr_warn("target host has requested different behaviour from the local host\n");
|
||||
status = NVME_SC_INTERNAL;
|
||||
}
|
||||
|
||||
out_free_host:
|
||||
kfree(host);
|
||||
out_complete_req:
|
||||
nvmet_req_complete(req, status);
|
||||
}
|
||||
|
||||
static u16 nvmet_setup_passthru_command(struct nvmet_req *req)
|
||||
{
|
||||
req->p.use_workqueue = false;
|
||||
req->execute = nvmet_passthru_execute_cmd;
|
||||
return NVME_SC_SUCCESS;
|
||||
}
|
||||
|
||||
u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
|
||||
{
|
||||
switch (req->cmd->common.opcode) {
|
||||
case nvme_cmd_resv_register:
|
||||
case nvme_cmd_resv_report:
|
||||
case nvme_cmd_resv_acquire:
|
||||
case nvme_cmd_resv_release:
|
||||
/*
|
||||
* Reservations cannot be supported properly because the
|
||||
* underlying device has no way of differentiating different
|
||||
* hosts that connect via fabrics. This could potentially be
|
||||
* emulated in the future if regular targets grow support for
|
||||
* this feature.
|
||||
*/
|
||||
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
|
||||
}
|
||||
|
||||
return nvmet_setup_passthru_command(req);
|
||||
}
|
||||
|
||||
/*
|
||||
* Only features that are emulated or specifically allowed in the list are
|
||||
* passed down to the controller. This function implements the allow list for
|
||||
* both get and set features.
|
||||
*/
|
||||
static u16 nvmet_passthru_get_set_features(struct nvmet_req *req)
|
||||
{
|
||||
switch (le32_to_cpu(req->cmd->features.fid)) {
|
||||
case NVME_FEAT_ARBITRATION:
|
||||
case NVME_FEAT_POWER_MGMT:
|
||||
case NVME_FEAT_LBA_RANGE:
|
||||
case NVME_FEAT_TEMP_THRESH:
|
||||
case NVME_FEAT_ERR_RECOVERY:
|
||||
case NVME_FEAT_VOLATILE_WC:
|
||||
case NVME_FEAT_WRITE_ATOMIC:
|
||||
case NVME_FEAT_AUTO_PST:
|
||||
case NVME_FEAT_TIMESTAMP:
|
||||
case NVME_FEAT_HCTM:
|
||||
case NVME_FEAT_NOPSC:
|
||||
case NVME_FEAT_RRL:
|
||||
case NVME_FEAT_PLM_CONFIG:
|
||||
case NVME_FEAT_PLM_WINDOW:
|
||||
case NVME_FEAT_HOST_BEHAVIOR:
|
||||
case NVME_FEAT_SANITIZE:
|
||||
case NVME_FEAT_VENDOR_START ... NVME_FEAT_VENDOR_END:
|
||||
return nvmet_setup_passthru_command(req);
|
||||
|
||||
case NVME_FEAT_ASYNC_EVENT:
|
||||
/* There is no support for forwarding ASYNC events */
|
||||
case NVME_FEAT_IRQ_COALESCE:
|
||||
case NVME_FEAT_IRQ_CONFIG:
|
||||
/* The IRQ settings will not apply to the target controller */
|
||||
case NVME_FEAT_HOST_MEM_BUF:
|
||||
/*
|
||||
* Any HMB that's set will not be passed through and will
|
||||
* not work as expected
|
||||
*/
|
||||
case NVME_FEAT_SW_PROGRESS:
|
||||
/*
|
||||
* The Pre-Boot Software Load Count doesn't make much
|
||||
* sense for a target to export
|
||||
*/
|
||||
case NVME_FEAT_RESV_MASK:
|
||||
case NVME_FEAT_RESV_PERSIST:
|
||||
/* No reservations, see nvmet_parse_passthru_io_cmd() */
|
||||
default:
|
||||
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
|
||||
}
|
||||
}
|
||||
|
||||
u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
|
||||
{
|
||||
/*
|
||||
* Passthru all vendor specific commands
|
||||
*/
|
||||
if (req->cmd->common.opcode >= nvme_admin_vendor_start)
|
||||
return nvmet_setup_passthru_command(req);
|
||||
|
||||
switch (req->cmd->common.opcode) {
|
||||
case nvme_admin_async_event:
|
||||
req->execute = nvmet_execute_async_event;
|
||||
return NVME_SC_SUCCESS;
|
||||
case nvme_admin_keep_alive:
|
||||
/*
|
||||
* Most PCIe ctrls don't support the keep alive cmd, so we route keep
* alive to the non-passthru mode. Change this code once PCIe ctrls
* with keep alive support become available.
|
||||
*/
|
||||
req->execute = nvmet_execute_keep_alive;
|
||||
return NVME_SC_SUCCESS;
|
||||
case nvme_admin_set_features:
|
||||
switch (le32_to_cpu(req->cmd->features.fid)) {
|
||||
case NVME_FEAT_ASYNC_EVENT:
|
||||
case NVME_FEAT_KATO:
|
||||
case NVME_FEAT_NUM_QUEUES:
|
||||
case NVME_FEAT_HOST_ID:
|
||||
req->execute = nvmet_execute_set_features;
|
||||
return NVME_SC_SUCCESS;
|
||||
case NVME_FEAT_HOST_BEHAVIOR:
|
||||
req->execute = nvmet_passthru_set_host_behaviour;
|
||||
return NVME_SC_SUCCESS;
|
||||
default:
|
||||
return nvmet_passthru_get_set_features(req);
|
||||
}
|
||||
break;
|
||||
case nvme_admin_get_features:
|
||||
switch (le32_to_cpu(req->cmd->features.fid)) {
|
||||
case NVME_FEAT_ASYNC_EVENT:
|
||||
case NVME_FEAT_KATO:
|
||||
case NVME_FEAT_NUM_QUEUES:
|
||||
case NVME_FEAT_HOST_ID:
|
||||
req->execute = nvmet_execute_get_features;
|
||||
return NVME_SC_SUCCESS;
|
||||
default:
|
||||
return nvmet_passthru_get_set_features(req);
|
||||
}
|
||||
break;
|
||||
case nvme_admin_identify:
|
||||
switch (req->cmd->identify.cns) {
|
||||
case NVME_ID_CNS_CTRL:
|
||||
req->execute = nvmet_passthru_execute_cmd;
|
||||
req->p.use_workqueue = true;
|
||||
return NVME_SC_SUCCESS;
|
||||
case NVME_ID_CNS_NS:
|
||||
req->execute = nvmet_passthru_execute_cmd;
|
||||
req->p.use_workqueue = true;
|
||||
return NVME_SC_SUCCESS;
|
||||
default:
|
||||
return nvmet_setup_passthru_command(req);
|
||||
}
|
||||
case nvme_admin_get_log_page:
|
||||
return nvmet_setup_passthru_command(req);
|
||||
default:
|
||||
/* Reject commands not in the allowlist above */
|
||||
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
|
||||
}
|
||||
}
|
||||
|
||||
int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
|
||||
{
|
||||
struct nvme_ctrl *ctrl;
|
||||
int ret = -EINVAL;
|
||||
void *old;
|
||||
|
||||
mutex_lock(&subsys->lock);
|
||||
if (!subsys->passthru_ctrl_path)
|
||||
goto out_unlock;
|
||||
if (subsys->passthru_ctrl)
|
||||
goto out_unlock;
|
||||
|
||||
if (subsys->nr_namespaces) {
|
||||
pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
ctrl = nvme_ctrl_get_by_path(subsys->passthru_ctrl_path);
|
||||
if (IS_ERR(ctrl)) {
|
||||
ret = PTR_ERR(ctrl);
|
||||
pr_err("failed to open nvme controller %s\n",
|
||||
subsys->passthru_ctrl_path);
|
||||
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL,
|
||||
subsys, GFP_KERNEL);
|
||||
if (xa_is_err(old)) {
|
||||
ret = xa_err(old);
|
||||
goto out_put_ctrl;
|
||||
}
|
||||
|
||||
if (old)
|
||||
goto out_put_ctrl;
|
||||
|
||||
subsys->passthru_ctrl = ctrl;
|
||||
subsys->ver = ctrl->vs;
|
||||
|
||||
if (subsys->ver < NVME_VS(1, 2, 1)) {
|
||||
pr_warn("nvme controller version is too old: %llu.%llu.%llu, advertising 1.2.1\n",
|
||||
NVME_MAJOR(subsys->ver), NVME_MINOR(subsys->ver),
|
||||
NVME_TERTIARY(subsys->ver));
|
||||
subsys->ver = NVME_VS(1, 2, 1);
|
||||
}
|
||||
|
||||
mutex_unlock(&subsys->lock);
|
||||
return 0;
|
||||
|
||||
out_put_ctrl:
|
||||
nvme_put_ctrl(ctrl);
|
||||
out_unlock:
|
||||
mutex_unlock(&subsys->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
|
||||
{
|
||||
if (subsys->passthru_ctrl) {
|
||||
xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid);
|
||||
nvme_put_ctrl(subsys->passthru_ctrl);
|
||||
}
|
||||
subsys->passthru_ctrl = NULL;
|
||||
subsys->ver = NVMET_DEFAULT_VS;
|
||||
}
|
||||
|
||||
void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
|
||||
{
|
||||
mutex_lock(&subsys->lock);
|
||||
__nvmet_passthru_ctrl_disable(subsys);
|
||||
mutex_unlock(&subsys->lock);
|
||||
}
|
||||
|
||||
void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
|
||||
{
|
||||
mutex_lock(&subsys->lock);
|
||||
__nvmet_passthru_ctrl_disable(subsys);
|
||||
mutex_unlock(&subsys->lock);
|
||||
kfree(subsys->passthru_ctrl_path);
|
||||
}
|
@ -752,7 +752,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
|
||||
{
|
||||
struct nvmet_rdma_rsp *rsp =
|
||||
container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
|
||||
struct nvmet_rdma_queue *queue = cq->cq_context;
|
||||
struct nvmet_rdma_queue *queue = wc->qp->qp_context;
|
||||
u16 status = 0;
|
||||
|
||||
WARN_ON(rsp->n_rdma <= 0);
|
||||
@ -1008,7 +1008,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
|
||||
{
|
||||
struct nvmet_rdma_cmd *cmd =
|
||||
container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
|
||||
struct nvmet_rdma_queue *queue = cq->cq_context;
|
||||
struct nvmet_rdma_queue *queue = wc->qp->qp_context;
|
||||
struct nvmet_rdma_rsp *rsp;
|
||||
|
||||
if (unlikely(wc->status != IB_WC_SUCCESS)) {
|
||||
@ -1258,9 +1258,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
|
||||
*/
|
||||
nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
|
||||
|
||||
queue->cq = ib_alloc_cq(ndev->device, queue,
|
||||
nr_cqe + 1, queue->comp_vector,
|
||||
IB_POLL_WORKQUEUE);
|
||||
queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1,
|
||||
queue->comp_vector, IB_POLL_WORKQUEUE);
|
||||
if (IS_ERR(queue->cq)) {
|
||||
ret = PTR_ERR(queue->cq);
|
||||
pr_err("failed to create CQ cqe= %d ret= %d\n",
|
||||
@ -1322,7 +1321,7 @@ out:
|
||||
err_destroy_qp:
|
||||
rdma_destroy_qp(queue->cm_id);
|
||||
err_destroy_cq:
|
||||
ib_free_cq(queue->cq);
|
||||
ib_cq_pool_put(queue->cq, nr_cqe + 1);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1332,7 +1331,8 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
|
||||
if (queue->cm_id)
|
||||
rdma_destroy_id(queue->cm_id);
|
||||
ib_destroy_qp(queue->qp);
|
||||
ib_free_cq(queue->cq);
|
||||
ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 *
|
||||
queue->send_queue_size + 1);
|
||||
}
|
||||
|
||||
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
|
||||
@ -1970,8 +1970,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.type = NVMF_TRTYPE_RDMA,
|
||||
.msdbd = 1,
|
||||
.has_keyed_sgls = 1,
|
||||
.metadata_support = 1,
|
||||
.flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED,
|
||||
.add_port = nvmet_rdma_add_port,
|
||||
.remove_port = nvmet_rdma_remove_port,
|
||||
.queue_response = nvmet_rdma_queue_response,
|
||||
|
@ -459,17 +459,11 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
|
||||
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
|
||||
{
|
||||
struct llist_node *node;
|
||||
struct nvmet_tcp_cmd *cmd;
|
||||
|
||||
node = llist_del_all(&queue->resp_list);
|
||||
if (!node)
|
||||
return;
|
||||
|
||||
while (node) {
|
||||
struct nvmet_tcp_cmd *cmd = llist_entry(node,
|
||||
struct nvmet_tcp_cmd, lentry);
|
||||
|
||||
for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
|
||||
cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
|
||||
list_add(&cmd->entry, &queue->resp_send_list);
|
||||
node = node->next;
|
||||
queue->send_list_len++;
|
||||
}
|
||||
}
|
||||
@ -1717,7 +1711,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.type = NVMF_TRTYPE_TCP,
|
||||
.msdbd = 1,
|
||||
.has_keyed_sgls = 0,
|
||||
.add_port = nvmet_tcp_add_port,
|
||||
.remove_port = nvmet_tcp_remove_port,
|
||||
.queue_response = nvmet_tcp_queue_response,
|
||||
|
@ -319,7 +319,7 @@ dasd_diag_check_device(struct dasd_device *device)
|
||||
struct dasd_diag_characteristics *rdc_data;
|
||||
struct vtoc_cms_label *label;
|
||||
struct dasd_block *block;
|
||||
struct dasd_diag_bio bio;
|
||||
struct dasd_diag_bio *bio;
|
||||
unsigned int sb, bsize;
|
||||
blocknum_t end_block;
|
||||
int rc;
|
||||
@ -395,29 +395,36 @@ dasd_diag_check_device(struct dasd_device *device)
|
||||
rc = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
bio = kzalloc(sizeof(*bio), GFP_KERNEL);
|
||||
if (bio == NULL) {
|
||||
DBF_DEV_EVENT(DBF_WARNING, device, "%s",
|
||||
"No memory to allocate initialization bio");
|
||||
rc = -ENOMEM;
|
||||
goto out_label;
|
||||
}
|
||||
rc = 0;
|
||||
end_block = 0;
|
||||
/* try all sizes - needed for ECKD devices */
|
||||
for (bsize = 512; bsize <= PAGE_SIZE; bsize <<= 1) {
|
||||
mdsk_init_io(device, bsize, 0, &end_block);
|
||||
memset(&bio, 0, sizeof (struct dasd_diag_bio));
|
||||
bio.type = MDSK_READ_REQ;
|
||||
bio.block_number = private->pt_block + 1;
|
||||
bio.buffer = label;
|
||||
memset(bio, 0, sizeof(*bio));
|
||||
bio->type = MDSK_READ_REQ;
|
||||
bio->block_number = private->pt_block + 1;
|
||||
bio->buffer = label;
|
||||
memset(&private->iob, 0, sizeof (struct dasd_diag_rw_io));
|
||||
private->iob.dev_nr = rdc_data->dev_nr;
|
||||
private->iob.key = 0;
|
||||
private->iob.flags = 0; /* do synchronous io */
|
||||
private->iob.block_count = 1;
|
||||
private->iob.interrupt_params = 0;
|
||||
private->iob.bio_list = &bio;
|
||||
private->iob.bio_list = bio;
|
||||
private->iob.flaga = DASD_DIAG_FLAGA_DEFAULT;
|
||||
rc = dia250(&private->iob, RW_BIO);
|
||||
if (rc == 3) {
|
||||
pr_warn("%s: A 64-bit DIAG call failed\n",
|
||||
dev_name(&device->cdev->dev));
|
||||
rc = -EOPNOTSUPP;
|
||||
goto out_label;
|
||||
goto out_bio;
|
||||
}
|
||||
mdsk_term_io(device);
|
||||
if (rc == 0)
|
||||
@ -427,7 +434,7 @@ dasd_diag_check_device(struct dasd_device *device)
|
||||
pr_warn("%s: Accessing the DASD failed because of an incorrect format (rc=%d)\n",
|
||||
dev_name(&device->cdev->dev), rc);
|
||||
rc = -EIO;
|
||||
goto out_label;
|
||||
goto out_bio;
|
||||
}
|
||||
/* check for label block */
|
||||
if (memcmp(label->label_id, DASD_DIAG_CMS1,
|
||||
@ -457,6 +464,8 @@ dasd_diag_check_device(struct dasd_device *device)
|
||||
(rc == 4) ? ", read-only device" : "");
|
||||
rc = 0;
|
||||
}
|
||||
out_bio:
|
||||
kfree(bio);
|
||||
out_label:
|
||||
free_page((long) label);
|
||||
out:
|
||||
@ -506,7 +515,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev,
|
||||
struct req_iterator iter;
|
||||
struct bio_vec bv;
|
||||
char *dst;
|
||||
unsigned int count, datasize;
|
||||
unsigned int count;
|
||||
sector_t recid, first_rec, last_rec;
|
||||
unsigned int blksize, off;
|
||||
unsigned char rw_cmd;
|
||||
@ -534,10 +543,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev,
|
||||
if (count != last_rec - first_rec + 1)
|
||||
return ERR_PTR(-EINVAL);
|
||||
/* Build the request */
|
||||
datasize = sizeof(struct dasd_diag_req) +
|
||||
count*sizeof(struct dasd_diag_bio);
|
||||
cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, datasize, memdev,
|
||||
blk_mq_rq_to_pdu(req));
|
||||
cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, struct_size(dreq, bio, count),
|
||||
memdev, blk_mq_rq_to_pdu(req));
|
||||
if (IS_ERR(cqr))
|
||||
return cqr;
|
||||
|
||||
|
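The dasd_diag hunk above replaces the open-coded header-plus-array size computation with struct_size(), which performs the same arithmetic for a flexible-array member with overflow checking. A plain userspace illustration with stand-in types:

#include <stdio.h>

struct example_bio {
	unsigned long block_number;
};

struct example_req {
	unsigned int block_count;
	struct example_bio bio[];	/* flexible array member */
};

int main(void)
{
	size_t count = 8;
	/* what struct_size(dreq, bio, count) evaluates to, minus the
	 * saturation on overflow */
	size_t datasize = sizeof(struct example_req) +
			  count * sizeof(((struct example_req *)0)->bio[0]);

	printf("allocation size for %zu bios: %zu bytes\n", count, datasize);
	return 0;
}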
@ -59,6 +59,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
|
||||
zone.non_seq = 1;
|
||||
|
||||
zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8]));
|
||||
zone.capacity = zone.len;
|
||||
zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16]));
|
||||
zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24]));
|
||||
if (zone.type != ZBC_ZONE_TYPE_CONV &&
|
||||
@ -716,6 +717,11 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
|
||||
/* The drive satisfies the kernel restrictions: set it up */
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
|
||||
blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
|
||||
if (sdkp->zones_max_open == U32_MAX)
|
||||
blk_queue_max_open_zones(q, 0);
|
||||
else
|
||||
blk_queue_max_open_zones(q, sdkp->zones_max_open);
|
||||
blk_queue_max_active_zones(q, 0);
|
||||
nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
|
||||
|
||||
/* READ16/WRITE16 is mandatory for ZBC disks */
|
||||
|
@ -513,6 +513,8 @@ struct request_queue {
|
||||
unsigned int nr_zones;
|
||||
unsigned long *conv_zones_bitmap;
|
||||
unsigned long *seq_zones_wlock;
|
||||
unsigned int max_open_zones;
|
||||
unsigned int max_active_zones;
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
|
||||
/*
|
||||
@ -722,6 +724,28 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
|
||||
return true;
|
||||
return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap);
|
||||
}
|
||||
|
||||
static inline void blk_queue_max_open_zones(struct request_queue *q,
|
||||
unsigned int max_open_zones)
|
||||
{
|
||||
q->max_open_zones = max_open_zones;
|
||||
}
|
||||
|
||||
static inline unsigned int queue_max_open_zones(const struct request_queue *q)
|
||||
{
|
||||
return q->max_open_zones;
|
||||
}
|
||||
|
||||
static inline void blk_queue_max_active_zones(struct request_queue *q,
|
||||
unsigned int max_active_zones)
|
||||
{
|
||||
q->max_active_zones = max_active_zones;
|
||||
}
|
||||
|
||||
static inline unsigned int queue_max_active_zones(const struct request_queue *q)
|
||||
{
|
||||
return q->max_active_zones;
|
||||
}
|
||||
#else /* CONFIG_BLK_DEV_ZONED */
|
||||
static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
|
||||
{
|
||||
@ -737,6 +761,14 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q,
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline unsigned int queue_max_open_zones(const struct request_queue *q)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline unsigned int queue_max_active_zones(const struct request_queue *q)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
|
||||
static inline bool rq_is_sync(struct request *rq)
|
||||
@ -1520,6 +1552,24 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
|
||||
if (q)
|
||||
return queue_max_open_zones(q);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
|
||||
if (q)
|
||||
return queue_max_active_zones(q);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int queue_dma_alignment(const struct request_queue *q)
|
||||
{
|
||||
return q ? q->dma_alignment : 511;
|
||||
|
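The request_queue grows max_open_zones/max_active_zones with the accessors above, and the sd_zbc hunk earlier is their first user. A small userspace sketch of that plumbing with stand-in types; the UINT32_MAX translation mirrors the sd_zbc change:

#include <stdint.h>
#include <stdio.h>

struct example_queue {
	unsigned int max_open_zones;	/* 0 is treated as "no limit" here */
	unsigned int max_active_zones;
};

static void example_read_zones(struct example_queue *q, uint32_t zones_max_open)
{
	/* mirrors sd_zbc_read_zones(): a device that reports no open-zone
	 * limit (all ones) is stored as 0 on the queue */
	q->max_open_zones = (zones_max_open == UINT32_MAX) ? 0 : zones_max_open;
	/* ZBC/ZAC disks have no active-zone limit at all */
	q->max_active_zones = 0;
}

int main(void)
{
	struct example_queue q;

	example_read_zones(&q, UINT32_MAX);
	printf("open=%u active=%u\n", q.max_open_zones, q.max_active_zones);
	example_read_zones(&q, 128);
	printf("open=%u active=%u\n", q.max_open_zones, q.max_active_zones);
	return 0;
}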
@ -672,7 +672,7 @@ enum {
|
||||
* Values set by the LLDD indicating completion status of the FCP operation.
|
||||
* Must be set prior to calling the done() callback.
|
||||
* @transferred_length: amount of DATA_OUT payload data received by a
|
||||
* a WRITEDATA operation. If not a WRITEDATA operation, value must
|
||||
* WRITEDATA operation. If not a WRITEDATA operation, value must
|
||||
* be set to 0. Should equal transfer_length on success.
|
||||
* @fcp_error: status of the FCP operation. Must be 0 on success; on failure
|
||||
* must be a NVME_SC_FC_xxxx value.
|
||||
|
@ -132,6 +132,7 @@ enum {
|
||||
#define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff)
|
||||
#define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf)
|
||||
#define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1)
|
||||
#define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff)
|
||||
#define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf)
|
||||
#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf)
|
||||
|
||||
@ -162,7 +163,6 @@ enum {
|
||||
|
||||
enum {
|
||||
NVME_CC_ENABLE = 1 << 0,
|
||||
NVME_CC_CSS_NVM = 0 << 4,
|
||||
NVME_CC_EN_SHIFT = 0,
|
||||
NVME_CC_CSS_SHIFT = 4,
|
||||
NVME_CC_MPS_SHIFT = 7,
|
||||
@ -170,6 +170,9 @@ enum {
|
||||
NVME_CC_SHN_SHIFT = 14,
|
||||
NVME_CC_IOSQES_SHIFT = 16,
|
||||
NVME_CC_IOCQES_SHIFT = 20,
|
||||
NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT,
|
||||
NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT,
|
||||
NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT,
|
||||
NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT,
|
||||
NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT,
|
||||
NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT,
|
||||
@ -179,6 +182,8 @@ enum {
|
||||
NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT,
|
||||
NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT,
|
||||
NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT,
|
||||
NVME_CAP_CSS_NVM = 1 << 0,
|
||||
NVME_CAP_CSS_CSI = 1 << 6,
|
||||
NVME_CSTS_RDY = 1 << 0,
|
||||
NVME_CSTS_CFS = 1 << 1,
|
||||
NVME_CSTS_NSSRO = 1 << 4,
|
||||
@ -307,6 +312,7 @@ enum {
|
||||
NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1,
|
||||
NVME_CTRL_ONCS_DSM = 1 << 2,
|
||||
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
|
||||
NVME_CTRL_ONCS_RESERVATIONS = 1 << 5,
|
||||
NVME_CTRL_ONCS_TIMESTAMP = 1 << 6,
|
||||
NVME_CTRL_VWC_PRESENT = 1 << 0,
|
||||
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
|
||||
@ -369,11 +375,37 @@ struct nvme_id_ns {
|
||||
__u8 vs[3712];
|
||||
};
|
||||
|
||||
struct nvme_zns_lbafe {
|
||||
__le64 zsze;
|
||||
__u8 zdes;
|
||||
__u8 rsvd9[7];
|
||||
};
|
||||
|
||||
struct nvme_id_ns_zns {
|
||||
__le16 zoc;
|
||||
__le16 ozcs;
|
||||
__le32 mar;
|
||||
__le32 mor;
|
||||
__le32 rrl;
|
||||
__le32 frl;
|
||||
__u8 rsvd20[2796];
|
||||
struct nvme_zns_lbafe lbafe[16];
|
||||
__u8 rsvd3072[768];
|
||||
__u8 vs[256];
|
||||
};
|
||||
|
||||
struct nvme_id_ctrl_zns {
|
||||
__u8 zasl;
|
||||
__u8 rsvd1[4095];
|
||||
};
|
||||
|
||||
enum {
|
||||
NVME_ID_CNS_NS = 0x00,
|
||||
NVME_ID_CNS_CTRL = 0x01,
|
||||
NVME_ID_CNS_NS_ACTIVE_LIST = 0x02,
|
||||
NVME_ID_CNS_NS_DESC_LIST = 0x03,
|
||||
NVME_ID_CNS_CS_NS = 0x05,
|
||||
NVME_ID_CNS_CS_CTRL = 0x06,
|
||||
NVME_ID_CNS_NS_PRESENT_LIST = 0x10,
|
||||
NVME_ID_CNS_NS_PRESENT = 0x11,
|
||||
NVME_ID_CNS_CTRL_NS_LIST = 0x12,
|
||||
@ -383,6 +415,11 @@ enum {
|
||||
NVME_ID_CNS_UUID_LIST = 0x17,
|
||||
};
|
||||
|
||||
enum {
|
||||
NVME_CSI_NVM = 0,
|
||||
NVME_CSI_ZNS = 2,
|
||||
};
|
||||
|
||||
enum {
|
||||
NVME_DIR_IDENTIFY = 0x00,
|
||||
NVME_DIR_STREAMS = 0x01,
|
||||
@ -435,11 +472,13 @@ struct nvme_ns_id_desc {
|
||||
#define NVME_NIDT_EUI64_LEN 8
|
||||
#define NVME_NIDT_NGUID_LEN 16
|
||||
#define NVME_NIDT_UUID_LEN 16
|
||||
#define NVME_NIDT_CSI_LEN 1
|
||||
|
||||
enum {
|
||||
NVME_NIDT_EUI64 = 0x01,
|
||||
NVME_NIDT_NGUID = 0x02,
|
||||
NVME_NIDT_UUID = 0x03,
|
||||
NVME_NIDT_CSI = 0x04,
|
||||
};
|
||||
|
||||
struct nvme_smart_log {
|
||||
@ -519,6 +558,27 @@ struct nvme_ana_rsp_hdr {
|
||||
__le16 rsvd10[3];
|
||||
};
|
||||
|
||||
struct nvme_zone_descriptor {
|
||||
__u8 zt;
|
||||
__u8 zs;
|
||||
__u8 za;
|
||||
__u8 rsvd3[5];
|
||||
__le64 zcap;
|
||||
__le64 zslba;
|
||||
__le64 wp;
|
||||
__u8 rsvd32[32];
|
||||
};
|
||||
|
||||
enum {
|
||||
NVME_ZONE_TYPE_SEQWRITE_REQ = 0x2,
|
||||
};
|
||||
|
||||
struct nvme_zone_report {
|
||||
__le64 nr_zones;
|
||||
__u8 resv8[56];
|
||||
struct nvme_zone_descriptor entries[];
|
||||
};
|
||||
|
||||
enum {
|
||||
NVME_SMART_CRIT_SPARE = 1 << 0,
|
||||
NVME_SMART_CRIT_TEMPERATURE = 1 << 1,
|
||||
@ -613,6 +673,9 @@ enum nvme_opcode {
|
||||
nvme_cmd_resv_report = 0x0e,
|
||||
nvme_cmd_resv_acquire = 0x11,
|
||||
nvme_cmd_resv_release = 0x15,
|
||||
nvme_cmd_zone_mgmt_send = 0x79,
|
||||
nvme_cmd_zone_mgmt_recv = 0x7a,
|
||||
nvme_cmd_zone_append = 0x7d,
|
||||
};
|
||||
|
||||
#define nvme_opcode_name(opcode) { opcode, #opcode }
|
||||
@ -751,6 +814,7 @@ struct nvme_rw_command {
|
||||
enum {
|
||||
NVME_RW_LR = 1 << 15,
|
||||
NVME_RW_FUA = 1 << 14,
|
||||
NVME_RW_APPEND_PIREMAP = 1 << 9,
|
||||
NVME_RW_DSM_FREQ_UNSPEC = 0,
|
||||
NVME_RW_DSM_FREQ_TYPICAL = 1,
|
||||
NVME_RW_DSM_FREQ_RARE = 2,
|
||||
@ -816,6 +880,53 @@ struct nvme_write_zeroes_cmd {
|
||||
__le16 appmask;
|
||||
};
|
||||
|
||||
enum nvme_zone_mgmt_action {
|
||||
NVME_ZONE_CLOSE = 0x1,
|
||||
NVME_ZONE_FINISH = 0x2,
|
||||
NVME_ZONE_OPEN = 0x3,
|
||||
NVME_ZONE_RESET = 0x4,
|
||||
NVME_ZONE_OFFLINE = 0x5,
|
||||
NVME_ZONE_SET_DESC_EXT = 0x10,
|
||||
};
|
||||
|
||||
struct nvme_zone_mgmt_send_cmd {
|
||||
__u8 opcode;
|
||||
__u8 flags;
|
||||
__u16 command_id;
|
||||
__le32 nsid;
|
||||
__le32 cdw2[2];
|
||||
__le64 metadata;
|
||||
union nvme_data_ptr dptr;
|
||||
__le64 slba;
|
||||
__le32 cdw12;
|
||||
__u8 zsa;
|
||||
__u8 select_all;
|
||||
__u8 rsvd13[2];
|
||||
__le32 cdw14[2];
|
||||
};
|
||||
|
||||
struct nvme_zone_mgmt_recv_cmd {
|
||||
__u8 opcode;
|
||||
__u8 flags;
|
||||
__u16 command_id;
|
||||
__le32 nsid;
|
||||
__le64 rsvd2[2];
|
||||
union nvme_data_ptr dptr;
|
||||
__le64 slba;
|
||||
__le32 numd;
|
||||
__u8 zra;
|
||||
__u8 zrasf;
|
||||
__u8 pr;
|
||||
__u8 rsvd13;
|
||||
__le32 cdw14[2];
|
||||
};
|
||||
|
||||
enum {
|
||||
NVME_ZRA_ZONE_REPORT = 0,
|
||||
NVME_ZRASF_ZONE_REPORT_ALL = 0,
|
||||
NVME_REPORT_ZONE_PARTIAL = 1,
|
||||
};
|
||||
|
||||
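The zone management opcodes and send actions above are enough to express, for example, a reset of every zone in a namespace. A trimmed userspace stand-in for struct nvme_zone_mgmt_send_cmd; the field layout is simplified and not wire-accurate:

#include <stdint.h>
#include <stdio.h>

enum { EX_CMD_ZONE_MGMT_SEND = 0x79 };	/* nvme_cmd_zone_mgmt_send */
enum { EX_ZONE_RESET = 0x4 };		/* NVME_ZONE_RESET */

struct ex_zone_mgmt_send {
	uint8_t  opcode;
	uint32_t nsid;
	uint64_t slba;		/* ignored when select_all is set */
	uint8_t  zsa;		/* zone send action */
	uint8_t  select_all;	/* apply to every zone of the namespace */
};

int main(void)
{
	struct ex_zone_mgmt_send c = {
		.opcode = EX_CMD_ZONE_MGMT_SEND,
		.nsid = 1,
		.slba = 0,
		.zsa = EX_ZONE_RESET,
		.select_all = 1,
	};

	printf("opcode=0x%x zsa=0x%x select_all=%d\n", c.opcode, c.zsa, c.select_all);
	return 0;
}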
/* Features */
|
||||
|
||||
enum {
|
||||
@ -872,6 +983,7 @@ enum nvme_admin_opcode {
|
||||
nvme_admin_security_recv = 0x82,
|
||||
nvme_admin_sanitize_nvm = 0x84,
|
||||
nvme_admin_get_lba_status = 0x86,
|
||||
nvme_admin_vendor_start = 0xC0,
|
||||
};
|
||||
|
||||
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
|
||||
@ -935,6 +1047,8 @@ enum {
|
||||
NVME_FEAT_RESV_MASK = 0x82,
|
||||
NVME_FEAT_RESV_PERSIST = 0x83,
|
||||
NVME_FEAT_WRITE_PROTECT = 0x84,
|
||||
NVME_FEAT_VENDOR_START = 0xC0,
|
||||
NVME_FEAT_VENDOR_END = 0xFF,
|
||||
NVME_LOG_ERROR = 0x01,
|
||||
NVME_LOG_SMART = 0x02,
|
||||
NVME_LOG_FW_SLOT = 0x03,
|
||||
@ -972,7 +1086,9 @@ struct nvme_identify {
|
||||
__u8 cns;
|
||||
__u8 rsvd3;
|
||||
__le16 ctrlid;
|
||||
__u32 rsvd11[5];
|
||||
__u8 rsvd11[3];
|
||||
__u8 csi;
|
||||
__u32 rsvd12[4];
|
||||
};
|
||||
|
||||
#define NVME_IDENTIFY_DATA_SIZE 4096
|
||||
@ -1086,7 +1202,9 @@ struct nvme_get_log_page_command {
|
||||
};
|
||||
__le64 lpo;
|
||||
};
|
||||
__u32 rsvd14[2];
|
||||
__u8 rsvd14[3];
|
||||
__u8 csi;
|
||||
__u32 rsvd15;
|
||||
};
|
||||
|
||||
struct nvme_directive_cmd {
|
||||
@ -1283,6 +1401,8 @@ struct nvme_command {
|
||||
struct nvme_format_cmd format;
|
||||
struct nvme_dsm_cmd dsm;
|
||||
struct nvme_write_zeroes_cmd write_zeroes;
|
||||
struct nvme_zone_mgmt_send_cmd zms;
|
||||
struct nvme_zone_mgmt_recv_cmd zmr;
|
||||
struct nvme_abort_cmd abort;
|
||||
struct nvme_get_log_page_command get_log_page;
|
||||
struct nvmf_common_command fabrics;
|
||||
@ -1416,6 +1536,18 @@ enum {
|
||||
NVME_SC_DISCOVERY_RESTART = 0x190,
|
||||
NVME_SC_AUTH_REQUIRED = 0x191,
|
||||
|
||||
/*
|
||||
* I/O Command Set Specific - Zoned commands:
|
||||
*/
|
||||
NVME_SC_ZONE_BOUNDARY_ERROR = 0x1b8,
|
||||
NVME_SC_ZONE_FULL = 0x1b9,
|
||||
NVME_SC_ZONE_READ_ONLY = 0x1ba,
|
||||
NVME_SC_ZONE_OFFLINE = 0x1bb,
|
||||
NVME_SC_ZONE_INVALID_WRITE = 0x1bc,
|
||||
NVME_SC_ZONE_TOO_MANY_ACTIVE = 0x1bd,
|
||||
NVME_SC_ZONE_TOO_MANY_OPEN = 0x1be,
|
||||
NVME_SC_ZONE_INVALID_TRANSITION = 0x1bf,
|
||||
|
||||
/*
|
||||
* Media and Data Integrity Errors:
|
||||
*/
|
||||
|
@ -141,11 +141,13 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys)
|
||||
* Version 3: Cache device with new UUID format
|
||||
* Version 4: Backing device with data offset
|
||||
*/
|
||||
#define BCACHE_SB_VERSION_CDEV 0
|
||||
#define BCACHE_SB_VERSION_BDEV 1
|
||||
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
|
||||
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
|
||||
#define BCACHE_SB_MAX_VERSION 4
|
||||
#define BCACHE_SB_VERSION_CDEV 0
|
||||
#define BCACHE_SB_VERSION_BDEV 1
|
||||
#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3
|
||||
#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4
|
||||
#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5
|
||||
#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6
|
||||
#define BCACHE_SB_MAX_VERSION 6
|
||||
|
||||
#define SB_SECTOR 8
|
||||
#define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT)
|
||||
@ -173,7 +175,12 @@ struct cache_sb_disk {
|
||||
|
||||
__le64 flags;
|
||||
__le64 seq;
|
||||
__le64 pad[8];
|
||||
|
||||
__le64 feature_compat;
|
||||
__le64 feature_incompat;
|
||||
__le64 feature_ro_compat;
|
||||
|
||||
__le64 pad[5];
|
||||
|
||||
union {
|
||||
struct {
|
||||
@ -206,10 +213,16 @@ struct cache_sb_disk {
|
||||
__le16 keys;
|
||||
};
|
||||
__le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */
|
||||
__le16 bucket_size_hi;
|
||||
};
|
||||
|
||||
/*
|
||||
* This is for in-memory bcache super block.
|
||||
* NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member
|
||||
* size, ordering and even whole struct size may be different
|
||||
* from cache_sb_disk.
|
||||
*/
|
||||
struct cache_sb {
|
||||
__u64 csum;
|
||||
__u64 offset; /* sector where this sb was written */
|
||||
__u64 version;
|
||||
|
||||
@ -224,7 +237,10 @@ struct cache_sb {
|
||||
|
||||
__u64 flags;
|
||||
__u64 seq;
|
||||
__u64 pad[8];
|
||||
|
||||
__u64 feature_compat;
|
||||
__u64 feature_incompat;
|
||||
__u64 feature_ro_compat;
|
||||
|
||||
union {
|
||||
struct {
|
||||
@ -232,10 +248,9 @@ struct cache_sb {
|
||||
__u64 nbuckets; /* device size */
|
||||
|
||||
__u16 block_size; /* sectors */
|
||||
__u16 bucket_size; /* sectors */
|
||||
|
||||
__u16 nr_in_set;
|
||||
__u16 nr_this_dev;
|
||||
__u32 bucket_size; /* sectors */
|
||||
};
|
||||
struct {
|
||||
/* Backing devices */
|
||||
@ -262,7 +277,8 @@ struct cache_sb {
|
||||
static inline _Bool SB_IS_BDEV(const struct cache_sb *sb)
|
||||
{
|
||||
return sb->version == BCACHE_SB_VERSION_BDEV
|
||||
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
|
||||
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET
|
||||
|| sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES;
|
||||
}
|
||||
|
||||
BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1);
|
||||
|
@ -73,6 +73,15 @@ enum blk_zone_cond {
|
||||
BLK_ZONE_COND_OFFLINE = 0xF,
|
||||
};
|
||||
|
||||
/**
|
||||
* enum blk_zone_report_flags - Feature flags of reported zone descriptors.
|
||||
*
|
||||
* @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field.
|
||||
*/
|
||||
enum blk_zone_report_flags {
|
||||
BLK_ZONE_REP_CAPACITY = (1 << 0),
|
||||
};
|
||||
|
||||
/**
|
||||
* struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl.
|
||||
*
|
||||
@ -99,7 +108,9 @@ struct blk_zone {
|
||||
__u8 cond; /* Zone condition */
|
||||
__u8 non_seq; /* Non-sequential write resources active */
|
||||
__u8 reset; /* Reset write pointer recommended */
|
||||
__u8 reserved[36];
|
||||
__u8 resv[4];
|
||||
__u64 capacity; /* Zone capacity in number of sectors */
|
||||
__u8 reserved[24];
|
||||
};
|
||||
|
||||
/**
|
||||
@ -115,7 +126,7 @@ struct blk_zone {
|
||||
struct blk_zone_report {
|
||||
__u64 sector;
|
||||
__u32 nr_zones;
|
||||
__u8 reserved[4];
|
||||
__u32 flags;
|
||||
struct blk_zone zones[0];
|
||||
};
|
||||
|
||||
|
@ -123,7 +123,7 @@ typedef struct mdp_device_descriptor_s {
|
||||
|
||||
/*
|
||||
* Notes:
|
||||
* - if an array is being reshaped (restriped) in order to change the
|
||||
* - if an array is being reshaped (restriped) in order to change
|
||||
* the number of active devices in the array, 'raid_disks' will be
|
||||
* the larger of the old and new numbers. 'delta_disks' will
|
||||
* be the "new - old". So if +ve, raid_disks is the new value, and
|
||||
|