Skip to content

Commit e19ddb0

Browse files
bhumikasharma29neuenfeldttj
authored andcommitted
Add FS Health Check Failure Metric (opensearch-project#18435)
* Add FS Health Check Failure Metric Signed-off-by: Bhumika Sharma <[email protected]>
1 parent 18bb1f7 commit e19ddb0

File tree

4 files changed

+55
-20
lines changed

4 files changed

+55
-20
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
4949
- Added node-left metric to cluster manager ([#18421](https://github.com/opensearch-project/OpenSearch/pull/18421))
5050
- [Star tree] Remove star tree feature flag and add index setting to configure star tree search on index basis ([#18070](https://github.com/opensearch-project/OpenSearch/pull/18070))
5151
- Approximation Framework Enhancement: Update the BKD traversal logic to improve the performance on skewed data ([#18439](https://github.com/opensearch-project/OpenSearch/issues/18439))
52+
- Added FS Health Check Failure metric ([#18435](https://github.com/opensearch-project/OpenSearch/pull/18435))
5253

5354
### Changed
5455
- Create generic DocRequest to better categorize ActionRequests ([#18269](https://github.com/opensearch-project/OpenSearch/pull/18269)))

server/src/main/java/org/opensearch/monitor/fs/FsHealthService.java

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
import org.opensearch.env.NodeEnvironment;
4747
import org.opensearch.monitor.NodeHealthService;
4848
import org.opensearch.monitor.StatusInfo;
49+
import org.opensearch.telemetry.metrics.Counter;
50+
import org.opensearch.telemetry.metrics.MetricsRegistry;
4951
import org.opensearch.threadpool.Scheduler;
5052
import org.opensearch.threadpool.ThreadPool;
5153

@@ -74,6 +76,7 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
7476

7577
private static final Logger logger = LogManager.getLogger(FsHealthService.class);
7678
private final ThreadPool threadPool;
79+
private final MetricsRegistry metricsRegistry;
7780
private volatile boolean enabled;
7881
private volatile boolean brokenLock;
7982
private final TimeValue refreshInterval;
@@ -84,6 +87,8 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
8487
private volatile TimeValue healthyTimeoutThreshold;
8588
private final AtomicLong lastRunStartTimeMillis = new AtomicLong(Long.MIN_VALUE);
8689
private final AtomicBoolean checkInProgress = new AtomicBoolean();
90+
private final Counter fsHealthFailCounter;
91+
private static final String COUNTER_METRICS_UNIT = "1";
8792

8893
@Nullable
8994
private volatile Set<Path> unhealthyPaths;
@@ -115,14 +120,26 @@ public class FsHealthService extends AbstractLifecycleComponent implements NodeH
115120
Setting.Property.Dynamic
116121
);
117122

118-
public FsHealthService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool, NodeEnvironment nodeEnv) {
123+
public FsHealthService(
124+
Settings settings,
125+
ClusterSettings clusterSettings,
126+
ThreadPool threadPool,
127+
NodeEnvironment nodeEnv,
128+
MetricsRegistry metricsRegistry
129+
) {
119130
this.threadPool = threadPool;
120131
this.enabled = ENABLED_SETTING.get(settings);
121132
this.refreshInterval = REFRESH_INTERVAL_SETTING.get(settings);
122133
this.slowPathLoggingThreshold = SLOW_PATH_LOGGING_THRESHOLD_SETTING.get(settings);
123134
this.currentTimeMillisSupplier = threadPool::relativeTimeInMillis;
124135
this.healthyTimeoutThreshold = HEALTHY_TIMEOUT_SETTING.get(settings);
125136
this.nodeEnv = nodeEnv;
137+
this.metricsRegistry = metricsRegistry;
138+
fsHealthFailCounter = metricsRegistry.createCounter(
139+
"fsHealth.failure.count",
140+
"Counter for number of times FS health check has failed",
141+
COUNTER_METRICS_UNIT
142+
);
126143
clusterSettings.addSettingsUpdateConsumer(SLOW_PATH_LOGGING_THRESHOLD_SETTING, this::setSlowPathLoggingThreshold);
127144
clusterSettings.addSettingsUpdateConsumer(HEALTHY_TIMEOUT_SETTING, this::setHealthyTimeoutThreshold);
128145
clusterSettings.addSettingsUpdateConsumer(ENABLED_SETTING, this::setEnabled);
@@ -198,13 +215,21 @@ public void run() {
198215
} catch (Exception e) {
199216
logger.error("health check failed", e);
200217
} finally {
218+
emitMetric();
201219
if (checkEnabled) {
202220
boolean completed = checkInProgress.compareAndSet(true, false);
203221
assert completed;
204222
}
205223
}
206224
}
207225

226+
private void emitMetric() {
227+
StatusInfo healthStatus = getHealth();
228+
if (healthStatus.getStatus() == UNHEALTHY) {
229+
fsHealthFailCounter.add(1.0);
230+
}
231+
}
232+
208233
private void monitorFSHealth() {
209234
Set<Path> currentUnhealthyPaths = null;
210235
Path[] paths = null;

server/src/main/java/org/opensearch/node/Node.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,8 @@ protected Node(final Environment initialEnvironment, Collection<PluginInfo> clas
738738
settings,
739739
clusterService.getClusterSettings(),
740740
threadPool,
741-
nodeEnvironment
741+
nodeEnvironment,
742+
metricsRegistry
742743
);
743744
final SetOnce<RerouteService> rerouteServiceReference = new SetOnce<>();
744745
final InternalSnapshotsInfoService snapshotsInfoService = new InternalSnapshotsInfoService(

server/src/test/java/org/opensearch/monitor/fs/FsHealthServiceTests.java

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import org.opensearch.test.MockLogAppender;
4747
import org.opensearch.test.OpenSearchTestCase;
4848
import org.opensearch.test.junit.annotations.TestLogging;
49+
import org.opensearch.test.telemetry.TestInMemoryMetricsRegistry;
4950
import org.opensearch.threadpool.TestThreadPool;
5051
import org.opensearch.threadpool.ThreadPool;
5152
import org.junit.Before;
@@ -71,19 +72,27 @@
7172
public class FsHealthServiceTests extends OpenSearchTestCase {
7273

7374
private DeterministicTaskQueue deterministicTaskQueue;
75+
private TestInMemoryMetricsRegistry metricsRegistry;
7476

7577
@Before
7678
public void createObjects() {
7779
Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), "node").build();
7880
deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
81+
metricsRegistry = new TestInMemoryMetricsRegistry();
7982
}
8083

8184
public void testSchedulesHealthCheckAtRefreshIntervals() throws Exception {
8285
long refreshInterval = randomLongBetween(1000, 12000);
8386
final Settings settings = Settings.builder().put(FsHealthService.REFRESH_INTERVAL_SETTING.getKey(), refreshInterval + "ms").build();
8487
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
8588
try (NodeEnvironment env = newNodeEnvironment()) {
86-
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, deterministicTaskQueue.getThreadPool(), env);
89+
FsHealthService fsHealthService = new FsHealthService(
90+
settings,
91+
clusterSettings,
92+
deterministicTaskQueue.getThreadPool(),
93+
env,
94+
metricsRegistry
95+
);
8796
final long startTimeMillis = deterministicTaskQueue.getCurrentTimeMillis();
8897
fsHealthService.doStart();
8998
assertFalse(deterministicTaskQueue.hasRunnableTasks());
@@ -117,17 +126,17 @@ public void testFailsHealthOnIOException() throws IOException {
117126
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
118127
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
119128
try (NodeEnvironment env = newNodeEnvironment()) {
120-
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
129+
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env, metricsRegistry);
121130
fsHealthService.new FsHealthMonitor().run();
122131
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
123132
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
124133

125134
// disrupt file system
126135
disruptFileSystemProvider.restrictPathPrefix(""); // disrupt all paths
127136
disruptFileSystemProvider.injectIOException.set(true);
128-
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
129137
fsHealthService.new FsHealthMonitor().run();
130138
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
139+
assertEquals(Integer.valueOf(1), metricsRegistry.getCounterStore().get("fsHealth.failure.count").getCounterValue());
131140
for (Path path : env.nodeDataPaths()) {
132141
assertTrue(fsHealthService.getHealth().getInfo().contains(path.toString()));
133142
}
@@ -160,7 +169,7 @@ public void testLoggingOnHungIO() throws Exception {
160169
MockLogAppender mockAppender = MockLogAppender.createForLoggers(LogManager.getLogger(FsHealthService.class));
161170
NodeEnvironment env = newNodeEnvironment()
162171
) {
163-
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
172+
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env, metricsRegistry);
164173
int counter = 0;
165174
for (Path path : env.nodeDataPaths()) {
166175
mockAppender.addExpectation(
@@ -202,7 +211,7 @@ public void testFailsHealthOnHungIOBeyondHealthyTimeout() throws Exception {
202211
PathUtilsForTesting.installMock(fileSystem);
203212
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
204213
try (NodeEnvironment env = newNodeEnvironment()) {
205-
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
214+
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env, metricsRegistry);
206215
logger.info("--> Initial health status prior to the first monitor run");
207216
StatusInfo fsHealth = fsHealthService.getHealth();
208217
assertEquals(HEALTHY, fsHealth.getStatus());
@@ -214,30 +223,29 @@ public void testFailsHealthOnHungIOBeyondHealthyTimeout() throws Exception {
214223
assertEquals("health check passed", fsHealth.getInfo());
215224
logger.info("--> Disrupt file system");
216225
disruptFileSystemProvider.injectIODelay.set(true);
217-
final FsHealthService fsHealthSrvc = new FsHealthService(settings, clusterSettings, testThreadPool, env);
218-
fsHealthSrvc.doStart();
226+
fsHealthService.doStart();
219227
waitUntil(
220-
() -> fsHealthSrvc.getHealth().getStatus() == UNHEALTHY,
228+
() -> fsHealthService.getHealth().getStatus() == UNHEALTHY,
221229
healthyTimeoutThreshold + (2 * refreshInterval),
222230
TimeUnit.MILLISECONDS
223231
);
224-
fsHealth = fsHealthSrvc.getHealth();
232+
fsHealth = fsHealthService.getHealth();
225233
assertEquals(UNHEALTHY, fsHealth.getStatus());
226234
assertEquals("healthy threshold breached", fsHealth.getInfo());
227235
int disruptedPathCount = disruptFileSystemProvider.getInjectedPathCount();
228236
assertThat(disruptedPathCount, equalTo(1));
229237
logger.info("--> Fix file system disruption");
230238
disruptFileSystemProvider.injectIODelay.set(false);
231239
waitUntil(
232-
() -> fsHealthSrvc.getHealth().getStatus() == HEALTHY,
240+
() -> fsHealthService.getHealth().getStatus() == HEALTHY,
233241
delayBetweenChecks + (4 * refreshInterval),
234242
TimeUnit.MILLISECONDS
235243
);
236-
fsHealth = fsHealthSrvc.getHealth();
244+
fsHealth = fsHealthService.getHealth();
237245
assertEquals(HEALTHY, fsHealth.getStatus());
238246
assertEquals("health check passed", fsHealth.getInfo());
239247
assertEquals(disruptedPathCount, disruptFileSystemProvider.getInjectedPathCount());
240-
fsHealthSrvc.doStop();
248+
fsHealthService.doStop();
241249
} finally {
242250
PathUtilsForTesting.teardown();
243251
ThreadPool.terminate(testThreadPool, 500, TimeUnit.MILLISECONDS);
@@ -254,7 +262,7 @@ public void testFailsHealthOnSinglePathFsyncFailure() throws IOException {
254262
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
255263
try (NodeEnvironment env = newNodeEnvironment()) {
256264
Path[] paths = env.nodeDataPaths();
257-
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
265+
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env, metricsRegistry);
258266
fsHealthService.new FsHealthMonitor().run();
259267
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
260268
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
@@ -263,9 +271,9 @@ public void testFailsHealthOnSinglePathFsyncFailure() throws IOException {
263271
disruptFsyncFileSystemProvider.injectIOException.set(true);
264272
String disruptedPath = randomFrom(paths).toString();
265273
disruptFsyncFileSystemProvider.restrictPathPrefix(disruptedPath);
266-
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
267274
fsHealthService.new FsHealthMonitor().run();
268275
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
276+
assertEquals(Integer.valueOf(1), metricsRegistry.getCounterStore().get("fsHealth.failure.count").getCounterValue());
269277
assertThat(fsHealthService.getHealth().getInfo(), is("health check failed on [" + disruptedPath + "]"));
270278
assertEquals(1, disruptFsyncFileSystemProvider.getInjectedPathCount());
271279
} finally {
@@ -285,7 +293,7 @@ public void testFailsHealthOnSinglePathWriteFailure() throws IOException {
285293
TestThreadPool testThreadPool = new TestThreadPool(getClass().getName(), settings);
286294
try (NodeEnvironment env = newNodeEnvironment()) {
287295
Path[] paths = env.nodeDataPaths();
288-
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
296+
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env, metricsRegistry);
289297
fsHealthService.new FsHealthMonitor().run();
290298
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
291299
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
@@ -294,9 +302,9 @@ public void testFailsHealthOnSinglePathWriteFailure() throws IOException {
294302
String disruptedPath = randomFrom(paths).toString();
295303
disruptWritesFileSystemProvider.restrictPathPrefix(disruptedPath);
296304
disruptWritesFileSystemProvider.injectIOException.set(true);
297-
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
298305
fsHealthService.new FsHealthMonitor().run();
299306
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
307+
assertEquals(Integer.valueOf(1), metricsRegistry.getCounterStore().get("fsHealth.failure.count").getCounterValue());
300308
assertThat(fsHealthService.getHealth().getInfo(), is("health check failed on [" + disruptedPath + "]"));
301309
assertEquals(1, disruptWritesFileSystemProvider.getInjectedPathCount());
302310
} finally {
@@ -319,17 +327,17 @@ public void testFailsHealthOnUnexpectedLockFileSize() throws IOException {
319327
PathUtilsForTesting.installMock(fileSystem);
320328
final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
321329
try (NodeEnvironment env = newNodeEnvironment()) {
322-
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
330+
FsHealthService fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env, metricsRegistry);
323331
fsHealthService.new FsHealthMonitor().run();
324332
assertEquals(HEALTHY, fsHealthService.getHealth().getStatus());
325333
assertEquals("health check passed", fsHealthService.getHealth().getInfo());
326334

327335
// enabling unexpected file size injection
328336
unexpectedLockFileSizeFileSystemProvider.injectUnexpectedFileSize.set(true);
329337

330-
fsHealthService = new FsHealthService(settings, clusterSettings, testThreadPool, env);
331338
fsHealthService.new FsHealthMonitor().run();
332339
assertEquals(UNHEALTHY, fsHealthService.getHealth().getStatus());
340+
assertEquals(Integer.valueOf(1), metricsRegistry.getCounterStore().get("fsHealth.failure.count").getCounterValue());
333341
assertThat(fsHealthService.getHealth().getInfo(), is("health check failed due to broken node lock"));
334342
assertEquals(1, unexpectedLockFileSizeFileSystemProvider.getInjectedPathCount());
335343
} finally {

0 commit comments

Comments
 (0)