Skip to content

Commit 6a4dab7

Browse files
committed
Optimize GC flow with pinned timestamps
Signed-off-by: Sachin Kale <[email protected]>
1 parent 4223fab commit 6a4dab7

File tree

11 files changed

+912
-230
lines changed

11 files changed

+912
-230
lines changed

server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStorePinnedTimestampsGarbageCollectionIT.java

Lines changed: 433 additions & 0 deletions
Large diffs are not rendered by default.

server/src/internalClusterTest/java/org/opensearch/snapshots/DeleteSnapshotV2IT.java

Lines changed: 189 additions & 114 deletions
Large diffs are not rendered by default.

server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,10 @@ Set<String> getMetadataFilesToFilterActiveSegments(
817817
return metadataFilesToFilterActiveSegments;
818818
}
819819

820+
public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException {
821+
deleteStaleSegments(lastNMetadataFilesToKeep, Map.of());
822+
}
823+
820824
/**
821825
* Delete stale segment and metadata files
822826
* One metadata file is kept per commit (refresh updates the same file). To read segments uploaded to remote store,
@@ -832,7 +836,7 @@ Set<String> getMetadataFilesToFilterActiveSegments(
832836
* @param lastNMetadataFilesToKeep number of metadata files to keep
833837
* @throws IOException in case of I/O error while reading from / writing to remote segment store
834838
*/
835-
public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException {
839+
private void deleteStaleSegments(int lastNMetadataFilesToKeep, Map<String, Long> pinnedTimestampsToSkip) throws IOException {
836840
if (lastNMetadataFilesToKeep == -1) {
837841
logger.info(
838842
"Stale segment deletion is disabled if cluster.remote_store.index.segment_metadata.retention.max_count is set to -1"
@@ -854,12 +858,12 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException
854858
}
855859

856860
// Check last fetch status of pinned timestamps. If stale, return.
857-
if (RemoteStoreUtils.isPinnedTimestampStateStale()) {
861+
if (lastNMetadataFilesToKeep != 0 && RemoteStoreUtils.isPinnedTimestampStateStale()) {
858862
logger.warn("Skipping remote segment store garbage collection as last fetch of pinned timestamp is stale");
859863
return;
860864
}
861865

862-
Tuple<Long, Set<Long>> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps();
866+
Tuple<Long, Set<Long>> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps(pinnedTimestampsToSkip);
863867

864868
Set<String> implicitLockedFiles = RemoteStoreUtils.getPinnedTimestampLockedFiles(
865869
sortedMetadataFileList,
@@ -994,7 +998,9 @@ public static void remoteDirectoryCleanup(
994998
String remoteStoreRepoForIndex,
995999
String indexUUID,
9961000
ShardId shardId,
997-
RemoteStorePathStrategy pathStrategy
1001+
RemoteStorePathStrategy pathStrategy,
1002+
boolean forceClean,
1003+
Map<String, Long> pinnedTimestampsToSkip
9981004
) {
9991005
try {
10001006
RemoteSegmentStoreDirectory remoteSegmentStoreDirectory = (RemoteSegmentStoreDirectory) remoteDirectoryFactory.newDirectory(
@@ -1003,8 +1009,12 @@ public static void remoteDirectoryCleanup(
10031009
shardId,
10041010
pathStrategy
10051011
);
1006-
remoteSegmentStoreDirectory.deleteStaleSegments(0);
1007-
remoteSegmentStoreDirectory.deleteIfEmpty();
1012+
if (forceClean) {
1013+
remoteSegmentStoreDirectory.delete();
1014+
} else {
1015+
remoteSegmentStoreDirectory.deleteStaleSegments(0, pinnedTimestampsToSkip);
1016+
remoteSegmentStoreDirectory.deleteIfEmpty();
1017+
}
10081018
} catch (Exception e) {
10091019
staticLogger.error("Exception occurred while deleting directory", e);
10101020
}
@@ -1023,7 +1033,10 @@ private boolean deleteIfEmpty() throws IOException {
10231033
logger.info("Remote directory still has files, not deleting the path");
10241034
return false;
10251035
}
1036+
return delete();
1037+
}
10261038

1039+
private boolean delete() {
10271040
try {
10281041
remoteDataDirectory.delete();
10291042
remoteMetadataDirectory.delete();

server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java

Lines changed: 109 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ public class RemoteFsTimestampAwareTranslog extends RemoteFsTranslog {
6161
private final Map<String, Tuple<Long, Long>> oldFormatMetadataFileGenerationMap;
6262
private final Map<String, Tuple<Long, Long>> oldFormatMetadataFilePrimaryTermMap;
6363
private final AtomicLong minPrimaryTermInRemote = new AtomicLong(Long.MAX_VALUE);
64+
private long maxDeletedGenerationOnRemote = 0;
6465

6566
public RemoteFsTimestampAwareTranslog(
6667
TranslogConfig config,
@@ -135,13 +136,20 @@ protected void trimUnreferencedReaders(boolean indexDeleted, boolean trimLocal)
135136

136137
// This is to ensure that after the permits are acquired during primary relocation, there are no further modification on remote
137138
// store.
138-
if (startedPrimarySupplier.getAsBoolean() == false || pauseSync.get()) {
139+
if ((indexDeleted == false && startedPrimarySupplier.getAsBoolean() == false) || pauseSync.get()) {
139140
return;
140141
}
141142

142143
// This is to fail fast and avoid listing md files un-necessarily.
143144
if (indexDeleted == false && RemoteStoreUtils.isPinnedTimestampStateStale()) {
144-
logger.warn("Skipping remote segment store garbage collection as last fetch of pinned timestamp is stale");
145+
logger.warn("Skipping remote translog garbage collection as last fetch of pinned timestamp is stale");
146+
return;
147+
}
148+
149+
// This code block ensures parity with RemoteFsTranslog. Without this, we will end up making list translog metadata
150+
// call in each invocation of trimUnreferencedReaders
151+
long minGenerationToKeep = minRemoteGenReferenced - indexSettings().getRemoteTranslogExtraKeep();
152+
if (indexDeleted == false && (minGenerationToKeep <= maxDeletedGenerationOnRemote)) {
145153
return;
146154
}
147155

@@ -158,24 +166,20 @@ public void onResponse(List<BlobMetadata> blobMetadata) {
158166
List<String> metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList());
159167

160168
try {
161-
if (metadataFiles.size() <= 1) {
169+
if (indexDeleted == false && metadataFiles.size() <= 1) {
162170
logger.debug("No stale translog metadata files found");
163171
remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS);
164172
return;
165173
}
166174

167175
// Check last fetch status of pinned timestamps. If stale, return.
168176
if (indexDeleted == false && RemoteStoreUtils.isPinnedTimestampStateStale()) {
169-
logger.warn("Skipping remote segment store garbage collection as last fetch of pinned timestamp is stale");
177+
logger.warn("Skipping remote translog garbage collection as last fetch of pinned timestamp is stale");
170178
remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS);
171179
return;
172180
}
173181

174-
List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(
175-
metadataFiles,
176-
metadataFilePinnedTimestampMap,
177-
logger
178-
);
182+
List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles, indexDeleted);
179183

180184
// If index is not deleted, make sure to keep latest metadata file
181185
if (indexDeleted == false) {
@@ -194,21 +198,28 @@ public void onResponse(List<BlobMetadata> blobMetadata) {
194198
metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted);
195199

196200
logger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted);
201+
197202
Set<Long> generationsToBeDeleted = getGenerationsToBeDeleted(
198203
metadataFilesNotToBeDeleted,
199204
metadataFilesToBeDeleted,
200-
indexDeleted
205+
indexDeleted ? Long.MAX_VALUE : minRemoteGenReferenced
201206
);
202207

203208
logger.debug(() -> "generationsToBeDeleted = " + generationsToBeDeleted);
204209
if (generationsToBeDeleted.isEmpty() == false) {
210+
maxDeletedGenerationOnRemote = generationsToBeDeleted.stream().max(Long::compareTo).get();
211+
205212
// Delete stale generations
206213
translogTransferManager.deleteGenerationAsync(
207214
primaryTermSupplier.getAsLong(),
208215
generationsToBeDeleted,
209216
remoteGenerationDeletionPermits::release
210217
);
218+
} else {
219+
remoteGenerationDeletionPermits.release();
220+
}
211221

222+
if (metadataFilesToBeDeleted.isEmpty() == false) {
212223
// Delete stale metadata files
213224
translogTransferManager.deleteMetadataFilesAsync(
214225
metadataFilesToBeDeleted,
@@ -217,11 +228,10 @@ public void onResponse(List<BlobMetadata> blobMetadata) {
217228

218229
// Update cache to keep only those metadata files that are not getting deleted
219230
oldFormatMetadataFileGenerationMap.keySet().retainAll(metadataFilesNotToBeDeleted);
220-
221231
// Delete stale primary terms
222232
deleteStaleRemotePrimaryTerms(metadataFilesNotToBeDeleted);
223233
} else {
224-
remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS);
234+
remoteGenerationDeletionPermits.release();
225235
}
226236
} catch (Exception e) {
227237
remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS);
@@ -241,14 +251,8 @@ public void onFailure(Exception e) {
241251
protected Set<Long> getGenerationsToBeDeleted(
242252
List<String> metadataFilesNotToBeDeleted,
243253
List<String> metadataFilesToBeDeleted,
244-
boolean indexDeleted
254+
long minRemoteGenReferenced
245255
) throws IOException {
246-
long maxGenerationToBeDeleted = Long.MAX_VALUE;
247-
248-
if (indexDeleted == false) {
249-
maxGenerationToBeDeleted = minRemoteGenReferenced - 1 - indexSettings().getRemoteTranslogExtraKeep();
250-
}
251-
252256
Set<Long> generationsFromMetadataFilesToBeDeleted = new HashSet<>();
253257
for (String mdFile : metadataFilesToBeDeleted) {
254258
Tuple<Long, Long> minMaxGen = getMinMaxTranslogGenerationFromMetadataFile(mdFile, translogTransferManager);
@@ -262,24 +266,36 @@ protected Set<Long> getGenerationsToBeDeleted(
262266
Set<Long> generationsToBeDeleted = new HashSet<>();
263267
for (long generation : generationsFromMetadataFilesToBeDeleted) {
264268
// Check if the generation is not referred by metadata file matching pinned timestamps
265-
if (generation <= maxGenerationToBeDeleted && isGenerationPinned(generation, pinnedGenerations) == false) {
269+
// The check with minRemoteGenReferenced is redundant but kept as to make sure we don't delete generations
270+
// that are not persisted in remote segment store yet.
271+
if (generation < minRemoteGenReferenced && isGenerationPinned(generation, pinnedGenerations) == false) {
266272
generationsToBeDeleted.add(generation);
267273
}
268274
}
269275
return generationsToBeDeleted;
270276
}
271277

272-
protected List<String> getMetadataFilesToBeDeleted(List<String> metadataFiles) {
273-
return getMetadataFilesToBeDeleted(metadataFiles, metadataFilePinnedTimestampMap, logger);
278+
protected List<String> getMetadataFilesToBeDeleted(List<String> metadataFiles, boolean indexDeleted) {
279+
return getMetadataFilesToBeDeleted(
280+
metadataFiles,
281+
metadataFilePinnedTimestampMap,
282+
minRemoteGenReferenced,
283+
Map.of(),
284+
indexDeleted,
285+
logger
286+
);
274287
}
275288

276289
// Visible for testing
277290
protected static List<String> getMetadataFilesToBeDeleted(
278291
List<String> metadataFiles,
279292
Map<Long, String> metadataFilePinnedTimestampMap,
293+
long minRemoteGenReferenced,
294+
Map<String, Long> pinnedTimestampsToSkip,
295+
boolean indexDeleted,
280296
Logger logger
281297
) {
282-
Tuple<Long, Set<Long>> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps();
298+
Tuple<Long, Set<Long>> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps(pinnedTimestampsToSkip);
283299

284300
// Keep files since last successful run of scheduler
285301
List<String> metadataFilesToBeDeleted = RemoteStoreUtils.filterOutMetadataFilesBasedOnAge(
@@ -312,6 +328,22 @@ protected static List<String> getMetadataFilesToBeDeleted(
312328
metadataFilesToBeDeleted.size()
313329
);
314330

331+
if (indexDeleted == false) {
332+
// Filter out metadata files based on minRemoteGenReferenced
333+
List<String> metadataFilesContainingMinRemoteGenReferenced = metadataFilesToBeDeleted.stream().filter(md -> {
334+
long maxGeneration = TranslogTransferMetadata.getMaxGenerationFromFileName(md);
335+
return maxGeneration == -1 || maxGeneration > minRemoteGenReferenced;
336+
}).collect(Collectors.toList());
337+
metadataFilesToBeDeleted.removeAll(metadataFilesContainingMinRemoteGenReferenced);
338+
339+
logger.trace(
340+
"metadataFilesContainingMinRemoteGenReferenced.size = {}, metadataFilesToBeDeleted based on minRemoteGenReferenced filtering = {}, minRemoteGenReferenced = {}",
341+
metadataFilesContainingMinRemoteGenReferenced.size(),
342+
metadataFilesToBeDeleted.size(),
343+
minRemoteGenReferenced
344+
);
345+
}
346+
315347
return metadataFilesToBeDeleted;
316348
}
317349

@@ -472,50 +504,65 @@ protected static Tuple<Long, Long> getMinMaxPrimaryTermFromMetadataFile(
472504
}
473505
}
474506

475-
public static void cleanup(TranslogTransferManager translogTransferManager) throws IOException {
476-
ActionListener<List<BlobMetadata>> listMetadataFilesListener = new ActionListener<>() {
477-
@Override
478-
public void onResponse(List<BlobMetadata> blobMetadata) {
479-
List<String> metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList());
507+
public static void cleanup(
508+
TranslogTransferManager translogTransferManager,
509+
boolean forceClean,
510+
Map<String, Long> pinnedTimestampsToSkip
511+
) throws IOException {
512+
if (forceClean) {
513+
translogTransferManager.delete();
514+
} else {
515+
ActionListener<List<BlobMetadata>> listMetadataFilesListener = new ActionListener<>() {
516+
@Override
517+
public void onResponse(List<BlobMetadata> blobMetadata) {
518+
List<String> metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList());
519+
520+
try {
521+
if (metadataFiles.isEmpty()) {
522+
staticLogger.debug("No stale translog metadata files found");
523+
return;
524+
}
525+
List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(
526+
metadataFiles,
527+
new HashMap<>(),
528+
Long.MAX_VALUE,
529+
pinnedTimestampsToSkip,
530+
true,
531+
staticLogger
532+
);
533+
if (metadataFilesToBeDeleted.isEmpty()) {
534+
staticLogger.debug("No metadata files to delete");
535+
return;
536+
}
537+
staticLogger.debug(() -> "metadataFilesToBeDeleted = " + metadataFilesToBeDeleted);
480538

481-
try {
482-
if (metadataFiles.isEmpty()) {
483-
staticLogger.debug("No stale translog metadata files found");
484-
return;
485-
}
486-
List<String> metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles, new HashMap<>(), staticLogger);
487-
if (metadataFilesToBeDeleted.isEmpty()) {
488-
staticLogger.debug("No metadata files to delete");
489-
return;
490-
}
491-
staticLogger.debug(() -> "metadataFilesToBeDeleted = " + metadataFilesToBeDeleted);
539+
// For all the files that we are keeping, fetch min and max generations
540+
List<String> metadataFilesNotToBeDeleted = new ArrayList<>(metadataFiles);
541+
metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted);
542+
staticLogger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted);
492543

493-
// For all the files that we are keeping, fetch min and max generations
494-
List<String> metadataFilesNotToBeDeleted = new ArrayList<>(metadataFiles);
495-
metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted);
496-
staticLogger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted);
544+
// Delete stale metadata files
545+
translogTransferManager.deleteMetadataFilesAsync(metadataFilesToBeDeleted, () -> {});
497546

498-
// Delete stale metadata files
499-
translogTransferManager.deleteMetadataFilesAsync(metadataFilesToBeDeleted, () -> {});
547+
// Delete stale primary terms
548+
deleteStaleRemotePrimaryTerms(
549+
metadataFilesNotToBeDeleted,
550+
translogTransferManager,
551+
new HashMap<>(),
552+
new AtomicLong(Long.MAX_VALUE),
553+
staticLogger
554+
);
555+
} catch (Exception e) {
556+
staticLogger.error("Exception while cleaning up metadata and primary terms", e);
557+
}
558+
}
500559

501-
// Delete stale primary terms
502-
deleteStaleRemotePrimaryTerms(
503-
metadataFilesNotToBeDeleted,
504-
translogTransferManager,
505-
new HashMap<>(),
506-
new AtomicLong(Long.MAX_VALUE),
507-
staticLogger
508-
);
509-
} catch (Exception e) {
560+
@Override
561+
public void onFailure(Exception e) {
510562
staticLogger.error("Exception while cleaning up metadata and primary terms", e);
511563
}
512-
}
513-
514-
@Override
515-
public void onFailure(Exception e) {
516-
staticLogger.error("Exception while cleaning up metadata and primary terms", e);
517-
}
518-
};
519-
translogTransferManager.listTranslogMetadataFilesAsync(listMetadataFilesListener);
564+
};
565+
translogTransferManager.listTranslogMetadataFilesAsync(listMetadataFilesListener);
566+
}
520567
}
521568
}

server/src/main/java/org/opensearch/index/translog/RemoteFsTranslog.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -685,10 +685,11 @@ public void onUploadComplete(TransferSnapshot transferSnapshot) throws IOExcepti
685685
maxRemoteTranslogGenerationUploaded = generation;
686686
minRemoteGenReferenced = getMinFileGeneration();
687687
logger.debug(
688-
"Successfully uploaded translog for primary term = {}, generation = {}, maxSeqNo = {}",
688+
"Successfully uploaded translog for primary term = {}, generation = {}, maxSeqNo = {}, minRemoteGenReferenced = {}",
689689
primaryTerm,
690690
generation,
691-
maxSeqNo
691+
maxSeqNo,
692+
minRemoteGenReferenced
692693
);
693694
}
694695

server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferMetadata.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,16 @@ public static Tuple<Long, Long> getMinMaxTranslogGenerationFromFilename(String f
170170
}
171171
}
172172

173+
public static long getMaxGenerationFromFileName(String filename) {
174+
String[] tokens = filename.split(METADATA_SEPARATOR);
175+
try {
176+
return RemoteStoreUtils.invertLong(tokens[2]);
177+
} catch (Exception e) {
178+
logger.error(() -> new ParameterizedMessage("Exception while getting max generation from: {}", filename), e);
179+
return -1;
180+
}
181+
}
182+
173183
public static Tuple<Long, Long> getMinMaxPrimaryTermFromFilename(String filename) {
174184
String[] tokens = filename.split(METADATA_SEPARATOR);
175185
if (tokens.length < 7) {

0 commit comments

Comments
 (0)