From cd48282cc736377d5abf7c04de8c6ba864ba3794 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Thu, 14 Sep 2017 11:03:09 -0700
Subject: nvme: stop aer posting if controller state not live

If an nvme async_event command completes, in most cases, a new async
event is posted. However, if the controller enters a resetting or
reconnecting state, there is nothing to block the scheduled work
element from posting the async event again. Nor are there calls from
the transport to stop async events when an association dies.

In the case of FC, where the association is torn down, the aer must be
aborted on the FC link and completes through the normal job completion
path. Thus the terminated async event ends up being rescheduled even
though the controller isn't in a valid state for the aer, and the
reposting gets the transport into a partially torn down data structure.

It's possible to hit this scenario on rdma, although it is much less
likely: an aer would have to complete right as the association is
terminated, and since the association teardown reclaims the blk
requests via nvme_cancel_request(), the completion is immediate rather
than a link-related action as on FC.

Fix by putting controller state checks in both the async event
completion routine, where it schedules the async event, and in the
async event work routine, before it calls into the transport. This is
effectively a "stop_async_events()" behavior. The transport, when it
creates a new association with the subsystem, will transition the
state back to live and is already restarting the async event posting.

Signed-off-by: James Smart
[hch: remove taking a lock over reading the controller state]
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index acc816b67582..d470f031e27f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2590,7 +2590,7 @@ static void nvme_async_event_work(struct work_struct *work)
 		container_of(work, struct nvme_ctrl, async_event_work);
 
 	spin_lock_irq(&ctrl->lock);
-	while (ctrl->event_limit > 0) {
+	while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) {
 		int aer_idx = --ctrl->event_limit;
 
 		spin_unlock_irq(&ctrl->lock);
@@ -2677,7 +2677,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		/*FALLTHRU*/
 	case NVME_SC_ABORT_REQ:
 		++ctrl->event_limit;
-		queue_work(nvme_wq, &ctrl->async_event_work);
+		if (ctrl->state == NVME_CTRL_LIVE)
+			schedule_work(&ctrl->async_event_work);
 		break;
 	default:
 		break;
--
cgit v1.2.1
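For reference, here is how nvme_async_event_work() reads once this patch
is applied. Only the hunk above is authoritative; the lines outside it
(the ops->submit_async_event() call and the loop tail) are reconstructed
from the kernel source of this era, so treat this as a sketch rather
than a verbatim quote:

	static void nvme_async_event_work(struct work_struct *work)
	{
		struct nvme_ctrl *ctrl =
			container_of(work, struct nvme_ctrl, async_event_work);

		spin_lock_irq(&ctrl->lock);
		/* New guard: stop posting aers once the controller leaves LIVE. */
		while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) {
			int aer_idx = --ctrl->event_limit;

			/* Drop the lock around the call into the transport. */
			spin_unlock_irq(&ctrl->lock);
			ctrl->ops->submit_async_event(ctrl, aer_idx);
			spin_lock_irq(&ctrl->lock);
		}
		spin_unlock_irq(&ctrl->lock);
	}

The same NVME_CTRL_LIVE check guards the completion side (second hunk),
so neither the scheduling site nor the work routine can repost an aer
into a transport whose association is being torn down.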
From 0951338d9677f546e230685d68631dfd3f81cca5 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Thu, 7 Sep 2017 13:18:04 -0700
Subject: nvme: allow timed-out ios to retry

Currently nvme_req_needs_retry() applies several checks to see if a
retry is allowed. One of those is whether the current time has exceeded
the start time of the io plus the timeout length. This check means that
an io which times out is never allowed a retry, so applications see the
io failure.

Remove this check and allow the io to time out, like it does on other
protocols, and allow retries to be made.

On the FC transport, a frame can be lost for an individual io, and
there may be no other errors that escalate for the
connection/association. The io will time out, which causes the
transport to escalate into creating a new association, but the io that
timed out has, due to this retry logic, already failed back to the
application, and things are hosed.

Signed-off-by: James Smart
Reviewed-by: Keith Busch
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d470f031e27f..5589f67d2cd8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -134,8 +134,6 @@ static inline bool nvme_req_needs_retry(struct request *req)
 		return false;
 	if (nvme_req(req)->status & NVME_SC_DNR)
 		return false;
-	if (jiffies - req->start_time >= req->timeout)
-		return false;
 	if (nvme_req(req)->retries >= nvme_max_retries)
 		return false;
 	return true;
--
cgit v1.2.1

From 1a40d97288c6ffea9b355139e88fa62f0e5439f7 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Thu, 21 Sep 2017 17:01:36 +0300
Subject: nvme-core: Use nvme_wq to queue async events and fw activation

async_event_work might race with itself, as it is currently queued on
two different workqueues (the system workqueue via schedule_work() and
nvme_wq via queue_work()). Queue it, and fw_act_work as well, on
nvme_wq only.

Reviewed-by: Johannes Thumshirn
Signed-off-by: Sagi Grimberg
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 5589f67d2cd8..bb2aad078637 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2676,7 +2676,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 	case NVME_SC_ABORT_REQ:
 		++ctrl->event_limit;
 		if (ctrl->state == NVME_CTRL_LIVE)
-			schedule_work(&ctrl->async_event_work);
+			queue_work(nvme_wq, &ctrl->async_event_work);
 		break;
 	default:
 		break;
@@ -2691,7 +2691,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		nvme_queue_scan(ctrl);
 		break;
 	case NVME_AER_NOTICE_FW_ACT_STARTING:
-		schedule_work(&ctrl->fw_act_work);
+		queue_work(nvme_wq, &ctrl->fw_act_work);
 		break;
 	default:
 		dev_warn(ctrl->device, "async event result %08x\n", result);
--
cgit v1.2.1

From 007a61ae2f35c7fcf767313285c4924e81f11983 Mon Sep 17 00:00:00 2001
From: Martin Wilck
Date: Thu, 28 Sep 2017 21:33:23 +0200
Subject: nvme: fix visibility of "uuid" ns attribute

"uuid" must be invisible if both ns->uuid and ns->nguid are unset, not
if either one is.

Fixes: d934f9848a77 ("nvme: provide UUID value to userspace")
Signed-off-by: Martin Wilck
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bb2aad078637..5a14cc7f28ee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2136,7 +2136,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
 	if (a == &dev_attr_uuid.attr) {
-		if (uuid_is_null(&ns->uuid) ||
+		if (uuid_is_null(&ns->uuid) &&
 		    !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
			return 0;
 	}
--
cgit v1.2.1
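A note on the idiom in that last hunk: memchr_inv(p, c, n) returns a
pointer to the first byte of p that differs from c, or NULL if all n
bytes equal c, so !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)) is true
exactly when the NGUID is all-zero, i.e. unset. A minimal standalone
sketch of the corrected visibility rule follows; the struct, helper, and
function names are illustrative stand-ins, not kernel code:

	#include <stdbool.h>
	#include <stdio.h>
	#include <stddef.h>

	/* Stand-in for the namespace identifiers; all-zero means unset. */
	struct ns_ids {
		unsigned char uuid[16];
		unsigned char nguid[16];
	};

	/* Equivalent of !memchr_inv(p, 0, n): true iff every byte is 0. */
	static bool buf_is_zero(const unsigned char *p, size_t n)
	{
		for (size_t i = 0; i < n; i++)
			if (p[i])
				return false;
		return true;
	}

	/* Post-patch rule: hide "uuid" only when BOTH identifiers are unset. */
	static bool uuid_attr_visible(const struct ns_ids *ids)
	{
		return !(buf_is_zero(ids->uuid, 16) && buf_is_zero(ids->nguid, 16));
	}

	int main(void)
	{
		struct ns_ids none = { { 0 }, { 0 } }, only_nguid = { { 0 }, { 0 } };

		only_nguid.nguid[0] = 0xab;	/* NGUID set, UUID unset */

		/* The pre-patch "||" would wrongly have hidden "uuid" in the
		 * second case as well. */
		printf("both unset -> visible? %d\n", uuid_attr_visible(&none));       /* 0 */
		printf("nguid set  -> visible? %d\n", uuid_attr_visible(&only_nguid)); /* 1 */
		return 0;
	}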