Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp

private static final String TESTS_FILE = "tests-";

private static final String METADATA_NAME_FORMAT = "meta-%s.dat";
private static final String METADATA_PREFIX = "meta-";

private static final String METADATA_NAME_FORMAT = METADATA_PREFIX + "%s.dat";

private static final String METADATA_CODEC = "metadata";

Expand Down Expand Up @@ -392,13 +394,16 @@ public void deleteSnapshot(SnapshotId snapshotId, long repositoryStateId, Action
}
// Delete snapshot from the index file, since it is the maintainer of truth of active snapshots
final RepositoryData updatedRepositoryData;
final RepositoryData repositoryData;
final Map<String, BlobContainer> foundIndices;
final Set<String> rootBlobs;
try {
final RepositoryData repositoryData = getRepositoryData();
repositoryData = getRepositoryData();
updatedRepositoryData = repositoryData.removeSnapshot(snapshotId);
// Cache the indices that were found before writing out the new index-N blob so that a stuck master will never
// delete an index that was created by another master node after writing this index-N blob.
foundIndices = blobStore().blobContainer(basePath().add("indices")).children();
rootBlobs = blobContainer().listBlobs().keySet();
writeIndexGen(updatedRepositoryData, repositoryStateId);
} catch (Exception ex) {
listener.onFailure(new RepositoryException(metadata.name(), "failed to delete snapshot [" + snapshotId + "]", ex));
Expand All @@ -420,12 +425,58 @@ public void deleteSnapshot(SnapshotId snapshotId, long repositoryStateId, Action
snapshotId,
ActionListener.map(listener, v -> {
cleanupStaleIndices(foundIndices, survivingIndices);
// Cleaning up according to repository data before the delete so we don't accidentally identify the two just deleted
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this too subtle, in particular because the cleanupStaleRootFiles method now operates on old repository data, which is not clear when looking at that method itself.

Can you instead do something like the following:

diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java
index 8b731f05a39..5c6695a93e0 100644
--- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java
+++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java
@@ -59,6 +59,7 @@ import org.elasticsearch.common.settings.Setting;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.util.set.Sets;
 import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentFactory;
@@ -101,6 +102,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
@@ -394,11 +396,10 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
             }
             // Delete snapshot from the index file, since it is the maintainer of truth of active snapshots
             final RepositoryData updatedRepositoryData;
-            final RepositoryData repositoryData;
             final Map<String, BlobContainer> foundIndices;
             final Set<String> rootBlobs;
             try {
-                repositoryData = getRepositoryData();
+                final RepositoryData repositoryData = getRepositoryData();
                 updatedRepositoryData = repositoryData.removeSnapshot(snapshotId);
                 // Cache the indices that were found before writing out the new index-N blob so that a stuck master will never
                 // delete an index that was created by another master node after writing this index-N blob.
@@ -410,9 +411,10 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
                 return;
             }
             final SnapshotInfo finalSnapshotInfo = snapshot;
+            final List<String> snapMetaFilesToDelete =
+                Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID()));
             try {
-                blobContainer().deleteBlobsIgnoringIfNotExists(
-                    Arrays.asList(snapshotFormat.blobName(snapshotId.getUUID()), globalMetaDataFormat.blobName(snapshotId.getUUID())));
+                blobContainer().deleteBlobsIgnoringIfNotExists(snapMetaFilesToDelete);
             } catch (IOException e) {
                 logger.warn(() -> new ParameterizedMessage("[{}] Unable to delete global metadata files", snapshotId), e);
             }
@@ -425,9 +427,8 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
                 snapshotId,
                 ActionListener.map(listener, v -> {
                     cleanupStaleIndices(foundIndices, survivingIndices);
-                    // Cleaning up according to repository data before the delete so we don't accidentally identify the two just deleted
-                    // blobs for the current snapshot as stale.
-                    cleanupStaleRootFiles(rootBlobs, repositoryData);
+                    // Remove snapMetaFilesToDelete, which have been deleted in a prior step, so that they are not identified as stale
+                    cleanupStaleRootFiles(Sets.difference(rootBlobs, new HashSet<>(snapMetaFilesToDelete)), updatedRepositoryData);
                     return null;
                 })
             );

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure will do :)

// blobs for the current snapshot as stale.
cleanupStaleRootFiles(rootBlobs, repositoryData);
return null;
})
);
}
}

/**
 * Removes stale blobs from the repository's root path.
 * A root blob is considered stale if it is a leftover temporary blob, or if it is a
 * {@code snap-*.dat} / {@code meta-*.dat} blob whose embedded snapshot UUID does not
 * belong to any snapshot tracked by the given {@link RepositoryData}.
 * Failures during deletion are logged but never propagated to the caller.
 *
 * @param rootBlobNames  names of all blobs found at the repository root
 * @param repositoryData repository data used to resolve the set of live snapshot UUIDs
 */
private void cleanupStaleRootFiles(Set<String> rootBlobNames, RepositoryData repositoryData) {
    final Set<String> survivingUUIDs = repositoryData.getAllSnapshotIds()
        .stream()
        .map(SnapshotId::getUUID)
        .collect(Collectors.toSet());
    final List<String> staleBlobs = new ArrayList<>();
    for (final String blobName : rootBlobNames) {
        if (isStaleRootBlob(blobName, survivingUUIDs)) {
            staleBlobs.add(blobName);
        }
    }
    if (staleBlobs.isEmpty()) {
        return;
    }
    try {
        logger.info("[{}] Found stale root level blobs {}. Cleaning them up", metadata.name(), staleBlobs);
        blobContainer().deleteBlobsIgnoringIfNotExists(staleBlobs);
    } catch (IOException e) {
        logger.warn(() -> new ParameterizedMessage(
            "[{}] The following blobs are no longer part of any snapshot [{}] but failed to remove them",
            metadata.name(), staleBlobs), e);
    } catch (Exception e) {
        // TODO: We shouldn't be blanket catching and suppressing all exceptions here and instead handle them safely upstream.
        //       Currently this catch exists as a stop gap solution to tackle unexpected runtime exceptions from implementations
        //       bubbling up and breaking the snapshot functionality.
        assert false : e;
        logger.warn(new ParameterizedMessage("[{}] Exception during cleanup of root level blobs", metadata.name()), e);
    }
}

/**
 * Decides whether a single root-level blob is stale relative to the given set of live
 * snapshot UUIDs: leftover temp blobs are always stale; {@code snap-}/{@code meta-}
 * prefixed {@code .dat} blobs are stale when their UUID is not in {@code survivingUUIDs};
 * everything else is kept.
 */
private boolean isStaleRootBlob(String blobName, Set<String> survivingUUIDs) {
    if (FsBlobContainer.isTempBlobName(blobName)) {
        return true;
    }
    if (blobName.endsWith(".dat") == false) {
        return false;
    }
    final String uuid;
    if (blobName.startsWith(SNAPSHOT_PREFIX)) {
        uuid = blobName.substring(SNAPSHOT_PREFIX.length(), blobName.length() - ".dat".length());
        // Sanity check: re-deriving the blob name from the extracted UUID must round-trip.
        assert snapshotFormat.blobName(uuid).equals(blobName);
    } else if (blobName.startsWith(METADATA_PREFIX)) {
        uuid = blobName.substring(METADATA_PREFIX.length(), blobName.length() - ".dat".length());
        assert globalMetaDataFormat.blobName(uuid).equals(blobName);
    } else {
        return false;
    }
    return survivingUUIDs.contains(uuid) == false;
}

private void cleanupStaleIndices(Map<String, BlobContainer> foundIndices, Map<String, IndexId> survivingIndices) {
try {
final Set<String> survivingIndexIds = survivingIndices.values().stream()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.elasticsearch.threadpool.ThreadPool;

import java.io.ByteArrayInputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -237,6 +238,10 @@ protected void doRun() throws Exception {
final BlobStore blobStore = repo.blobStore();
blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo"))
.writeBlob("bar", new ByteArrayInputStream(new byte[0]), 0, false);
for (String prefix : Arrays.asList("snap-", "meta-")) {
blobStore.blobContainer(BlobPath.cleanPath())
.writeBlob(prefix + "foo.dat", new ByteArrayInputStream(new byte[0]), 0, false);
}
future.onResponse(null);
}
});
Expand All @@ -257,6 +262,8 @@ protected void doRun() throws Exception {
future.onResponse(
blobStore.blobContainer(BlobPath.cleanPath().add("indices")).children().containsKey("foo")
&& blobStore.blobContainer(BlobPath.cleanPath().add("indices").add("foo")).blobExists("bar")
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("meta-foo.dat")
&& blobStore.blobContainer(BlobPath.cleanPath()).blobExists("snap-foo.dat")
);
}
});
Expand Down