|
| 1 | +From 27559037308c7c79f80e15959f67db47dbfffaf6 Mon Sep 17 00:00:00 2001 |
| 2 | +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= |
| 3 | + |
| 4 | +Date: Thu, 9 Oct 2025 14:51:04 +0200 |
| 5 | +Subject: [PATCH] xen/xenbus: better handle backend crash |
| 6 | +MIME-Version: 1.0 |
| 7 | +Content-Type: text/plain; charset=UTF-8 |
| 8 | +Content-Transfer-Encoding: 8bit |
| 9 | + |
| 10 | +When the backend domain crashes, coordinated device cleanup is not |
| 11 | +possible (as it involves waiting for the backend state change). In that |
| 12 | +case, toolstack forcefully removes frontend xenstore entries. |
| 13 | +xenbus_dev_changed() handles this case, and triggers device cleanup. |
| 14 | +It's possible that toolstack manages to connect new device in that |
| 15 | +place, before xenbus_dev_changed() notices the old one is missing. If |
| 16 | +that happens, new one won't be probed and will forever remain in |
| 17 | +XenbusStateInitialising. |
| 18 | + |
| 19 | +Fix this by checking backend-id and if it changes, consider it |
| 20 | +unplug+plug operation. It's important that cleanup on such unplug |
| 21 | +doesn't modify xenstore entries (especially the "state" key) as it |
| 22 | +belong to the new device to be probed - changing it would derail |
| 23 | +establishing connection to the new backend (most likely, closing the |
| 24 | +device before it was even connected). Handle this case by setting new |
| 25 | +xenbus_device->vanished flag to true, and check it before changing state |
| 26 | +entry. |
| 27 | + |
| 28 | +The problem does not apply to frontend domain crash, as this case |
| 29 | +involves coordinated cleanup. |
| 30 | + |
| 31 | +Problem originally reported at |
| 32 | +https://lore.kernel.org/xen-devel/aOZvivyZ9YhVWDLN@mail-itl/T/#t, |
| 33 | +including reproduction steps. |
| 34 | + |
| 35 | +Signed-off-by: Marek Marczykowski-Górecki < [email protected]> |
| 36 | +--- |
| 37 | +I considered re-using one of existing fields instead of a new |
| 38 | +xenbus_device->vanished, but I wasn't sure if that would work better. |
| 39 | +Setting xenbus_device->nodename to NULL would prevent few other places |
| 40 | +using it (including some log messages). Setting xenbus_device->otherend |
| 41 | +might have less unintentional impact, but logically it doesn't feel |
| 42 | +correct. |
| 43 | +--- |
| 44 | + drivers/xen/xenbus/xenbus_client.c | 2 ++ |
| 45 | + drivers/xen/xenbus/xenbus_probe.c | 19 +++++++++++++++++++ |
| 46 | + include/xen/xenbus.h | 1 + |
| 47 | + 3 files changed, 22 insertions(+) |
| 48 | + |
| 49 | +diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c |
| 50 | +index 51b3124b0d56c..21775cfbe4b83 100644 |
| 51 | +--- a/drivers/xen/xenbus/xenbus_client.c |
| 52 | ++++ b/drivers/xen/xenbus/xenbus_client.c |
| 53 | +@@ -274,6 +274,8 @@ __xenbus_switch_state(struct xenbus_device *dev, |
| 54 | + */ |
| 55 | + int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) |
| 56 | + { |
| 57 | ++ if (dev->vanished) |
| 58 | ++ return 0; |
| 59 | + return __xenbus_switch_state(dev, state, 0); |
| 60 | + } |
| 61 | + |
| 62 | +diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c |
| 63 | +index 86fe6e7790566..29db6a43a3b61 100644 |
| 64 | +--- a/drivers/xen/xenbus/xenbus_probe.c |
| 65 | ++++ b/drivers/xen/xenbus/xenbus_probe.c |
| 66 | +@@ -659,6 +659,25 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) |
| 67 | + return; |
| 68 | + |
| 69 | + dev = xenbus_device_find(root, &bus->bus); |
| 70 | ++ /* Backend domain crash results in not coordinated frontend removal, |
| 71 | ++ * without going through XenbusStateClosing. Check if the device |
| 72 | ++ * wasn't replaced to point at another backend in the meantime. |
| 73 | ++ */ |
| 74 | ++ if (dev && !strncmp(node, "device/", sizeof("device/")-1)) { |
| 75 | ++ int backend_id; |
| 76 | ++ int err = xenbus_gather(XBT_NIL, root, |
| 77 | ++ "backend-id", "%i", &backend_id, |
| 78 | ++ NULL); |
| 79 | ++ if (!err && backend_id != dev->otherend_id) { |
| 80 | ++ /* It isn't the same device, assume the old one |
| 81 | ++ * vanished and new one needs to be probed. |
| 82 | ++ */ |
| 83 | ++ dev->vanished = true; |
| 84 | ++ device_unregister(&dev->dev); |
| 85 | ++ put_device(&dev->dev); |
| 86 | ++ dev = NULL; |
| 87 | ++ } |
| 88 | ++ } |
| 89 | + if (!dev) |
| 90 | + xenbus_probe_node(bus, type, root); |
| 91 | + else |
| 92 | +diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h |
| 93 | +index 3f90bdd387b67..940352c9dc9bf 100644 |
| 94 | +--- a/include/xen/xenbus.h |
| 95 | ++++ b/include/xen/xenbus.h |
| 96 | +@@ -87,6 +87,7 @@ struct xenbus_device { |
| 97 | + struct completion down; |
| 98 | + struct work_struct work; |
| 99 | + struct semaphore reclaim_sem; |
| 100 | ++ bool vanished; |
| 101 | + |
| 102 | + /* Event channel based statistics and settings. */ |
| 103 | + atomic_t event_channels; |
| 104 | +-- |
| 105 | +2.51.0 |
| 106 | + |
0 commit comments