Skip to content

Commit 2768662

Browse files
Cleanup Stale Root Level Blobs in Sn. Repository (#43542) (#44226)
* Cleans up all root level temp., snap-%s.dat, meta-%s.dat blobs that aren't referenced by any snapshot to deal with dangling blobs left behind by delete and snapshot finalization failures * The scenario that gets us here is a snapshot failing before it was finalized, or a delete failing right after it wrote the updated index-(N+1) that no longer references a snapshot but before it removed that snapshot's blobs * Not deleting other dangling blobs, since they don't follow the snap-, meta- or tempfile naming schemes, so as not to accidentally delete blobs that were not created by the snapshot logic * Follow up to #42189 * Same safety logic: get the list of all blobs before writing the index-N blob, delete things after the index-N blob was written
1 parent e9f9f00 commit 2768662

File tree

2 files changed

+55
-2
lines changed

2 files changed

+55
-2
lines changed

server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import org.elasticsearch.common.settings.Settings;
5959
import org.elasticsearch.common.unit.ByteSizeUnit;
6060
import org.elasticsearch.common.unit.ByteSizeValue;
61+
import org.elasticsearch.common.util.set.Sets;
6162
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
6263
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
6364
import org.elasticsearch.common.xcontent.XContentFactory;
@@ -100,6 +101,7 @@
100101
import java.util.Arrays;
101102
import java.util.Collection;
102103
import java.util.Collections;
104+
import java.util.HashSet;
103105
import java.util.List;
104106
import java.util.Map;
105107
import java.util.Optional;
@@ -433,9 +435,10 @@ public void deleteSnapshot(SnapshotId snapshotId, long repositoryStateId, Action
433435
return;
434436
}
435437
final SnapshotInfo finalSnapshotInfo = snapshot;
438+
final List<String> snapMetaFilesToDelete =
439+
Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID()));
436440
try {
437-
blobContainer().deleteBlobsIgnoringIfNotExists(
438-
Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID())));
441+
blobContainer().deleteBlobsIgnoringIfNotExists(snapMetaFilesToDelete);
439442
} catch (IOException e) {
440443
logger.warn(() -> new ParameterizedMessage("[{}] Unable to delete global metadata files", snapshotId), e);
441444
}
@@ -448,12 +451,56 @@ public void deleteSnapshot(SnapshotId snapshotId, long repositoryStateId, Action
448451
snapshotId,
449452
ActionListener.map(listener, v -> {
450453
cleanupStaleIndices(foundIndices, survivingIndices);
454+
cleanupStaleRootFiles(Sets.difference(rootBlobs, new HashSet<>(snapMetaFilesToDelete)), updatedRepositoryData);
451455
return null;
452456
})
453457
);
454458
}
455459
}
456460

461+
/**
 * Best-effort removal of root level blobs that are no longer referenced by any snapshot.
 * <p>
 * A blob from {@code rootBlobNames} is selected for deletion when it is either a temporary blob
 * (per {@link FsBlobContainer#isTempBlobName}) or a {@code .dat} blob whose name starts with
 * {@code SNAPSHOT_PREFIX} or {@code METADATA_PREFIX} and whose embedded UUID is not among the
 * snapshot UUIDs in {@code repositoryData}. Blobs following any other naming scheme are left
 * untouched so that blobs not created by the snapshot logic are not deleted accidentally.
 * <p>
 * Failures while deleting are logged at warn level and are not rethrown.
 *
 * @param rootBlobNames  names of the root level blobs to examine
 * @param repositoryData repository data whose snapshot ids decide which blobs are still referenced
 */
private void cleanupStaleRootFiles(Set<String> rootBlobNames, RepositoryData repositoryData) {
462+
// UUIDs of all snapshots known to the given repository data
final Set<String> allSnapshotIds =
463+
repositoryData.getAllSnapshotIds().stream().map(SnapshotId::getUUID).collect(Collectors.toSet());
464+
final List<String> blobsToDelete = rootBlobNames.stream().filter(
465+
blob -> {
466+
// Temporary blobs are always selected for deletion
if (FsBlobContainer.isTempBlobName(blob)) {
467+
return true;
468+
}
469+
// Only .dat blobs carrying the snapshot or metadata prefix are candidates; anything else is kept
if (blob.endsWith(".dat")) {
470+
final String foundUUID;
471+
if (blob.startsWith(SNAPSHOT_PREFIX)) {
472+
// The UUID is whatever sits between the prefix and the ".dat" suffix
foundUUID = blob.substring(SNAPSHOT_PREFIX.length(), blob.length() - ".dat".length());
473+
assert snapshotFormat.blobName(foundUUID).equals(blob);
474+
} else if (blob.startsWith(METADATA_PREFIX)) {
475+
foundUUID = blob.substring(METADATA_PREFIX.length(), blob.length() - ".dat".length());
476+
assert globalMetaDataFormat.blobName(foundUUID).equals(blob);
477+
} else {
478+
return false;
479+
}
480+
// Stale only if the extracted UUID does not belong to any known snapshot
return allSnapshotIds.contains(foundUUID) == false;
481+
}
482+
return false;
483+
}
484+
).collect(Collectors.toList());
485+
// Nothing stale was found, so skip logging and the delete call entirely
if (blobsToDelete.isEmpty()) {
486+
return;
487+
}
488+
try {
489+
logger.info("[{}] Found stale root level blobs {}. Cleaning them up", metadata.name(), blobsToDelete);
490+
blobContainer().deleteBlobsIgnoringIfNotExists(blobsToDelete);
491+
} catch (IOException e) {
492+
// Deletion failure is only logged; the stale blobs stay in place
logger.warn(() -> new ParameterizedMessage(
493+
"[{}] The following blobs are no longer part of any snapshot [{}] but failed to remove them",
494+
metadata.name(), blobsToDelete), e);
495+
} catch (Exception e) {
496+
// TODO: We shouldn't be blanket catching and suppressing all exceptions here and instead handle them safely upstream.
497+
// Currently this catch exists as a stop gap solution to tackle unexpected runtime exceptions from implementations
498+
// bubbling up and breaking the snapshot functionality.
499+
assert false : e;
500+
logger.warn(new ParameterizedMessage("[{}] Exception during cleanup of root level blobs", metadata.name()), e);
501+
}
502+
}
503+
457504
private void cleanupStaleIndices(Map<String, BlobContainer> foundIndices, Map<String, IndexId> survivingIndices) {
458505
try {
459506
final Set<String> survivingIndexIds = survivingIndices.values().stream()

test/framework/src/main/java/org/elasticsearch/repositories/AbstractThirdPartyRepositoryTestCase.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,10 @@ protected void doRun() throws Exception {
236236
final BlobStore blobStore = repo.blobStore();
237237
blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo"))
238238
.writeBlob("bar", new ByteArrayInputStream(new byte[0]), 0, false);
239+
for (String prefix : Arrays.asList("snap-", "meta-")) {
240+
blobStore.blobContainer(BlobPath.cleanPath())
241+
.writeBlob(prefix + "foo.dat", new ByteArrayInputStream(new byte[0]), 0, false);
242+
}
239243
future.onResponse(null);
240244
}
241245
});
@@ -256,6 +260,8 @@ protected void doRun() throws Exception {
256260
future.onResponse(
257261
blobStore.blobContainer(BlobPath.cleanPath().add("indices")).children().containsKey("foo")
258262
&& blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo")).blobExists("bar")
263+
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("meta-foo.dat")
264+
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("snap-foo.dat")
259265
);
260266
}
261267
});

0 commit comments

Comments
 (0)