From c93daf0889e17bf7cf9184d12c4aff9ed4d5084c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 6 Aug 2025 20:48:35 +0300 Subject: [PATCH] Refactor SLRU download interface between Postgres and the neon extension Move the responsibility of writing the SLRU segment contents to the neon extension. The neon extension hook now downloads the file and places it to the correct path, and returns a true/false to the caller to indicate if the file existed. On successful download, the caller retries opening the file. While we're at it, refactor the read_slru_segment function so that it is not part of the SMGR api anymore, but an independent hook. We were previously pretending it's part of the SMGR api and because of that, passed a dummy SMgrRelation, but it was a bit silly. --- src/backend/access/transam/slru.c | 136 +++++++++++------------------- src/backend/storage/smgr/smgr.c | 31 ++++--- src/backend/utils/init/globals.c | 1 - src/include/miscadmin.h | 6 -- src/include/storage/smgr.h | 8 +- 5 files changed, 74 insertions(+), 108 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index b270e935a44..82fbc1385d9 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -736,71 +736,29 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) /* - * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer - * to download them on demand to reduce startup time. - * If SLRU segment is not found, we try to download it from page server + * NEON: we do not want to include large pg_xact/multixact files in the + * basebackup and prefer to download them on demand to reduce startup time. + * + * If SLRU segment is not found, we try to download it from the pageserver. + * + * Returns: + * true if the file was successfully downloaded + * false if the file was not found in pageserver + * ereports if some other error happened */ -static int -SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const* path) +static bool +SimpleLruDownloadSegment(SlruCtl ctl, int pageno, char const *path) { SlruShared shared = ctl->shared; - int segno; - int fd = -1; - int n_blocks; - char* buffer; - - static SMgrRelationData dummy_smgr_rel = {0}; + int segno; /* If page is greater than latest written page, then do not try to download segment from server */ if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number), pageno)) - return -1; + return false; - if (!dummy_smgr_rel.smgr) - { - RelFileLocator rlocator = {0}; - dummy_smgr_rel.smgr = smgr(INVALID_PROC_NUMBER, rlocator); - } segno = pageno / SLRU_PAGES_PER_SEGMENT; - if (neon_use_communicator_worker) { - buffer = NULL; - } else { - buffer = palloc(BLCKSZ * SLRU_PAGES_PER_SEGMENT); - } - - n_blocks = smgr_read_slru_segment(&dummy_smgr_rel, path, segno, buffer); - if (n_blocks > 0) - { - fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); - if (fd < 0) - { - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - pfree(buffer); - return -1; - } - - if (!neon_use_communicator_worker) { - errno = 0; - pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); - if (pg_pwrite(fd, buffer, n_blocks*BLCKSZ, 0) != n_blocks*BLCKSZ) - { - pgstat_report_wait_end(); - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - slru_errcause = SLRU_WRITE_FAILED; - slru_errno = errno; - - CloseTransientFile(fd); - pfree(buffer); - return -1; - } - pgstat_report_wait_end(); - } - } - pfree(buffer); - return fd; + return smgr_read_slru_segment(path, segno); } /* @@ -819,29 +777,34 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno) int fd; bool result; off_t endpos; + bool attempted_download = false; /* update the stats counter of checked pages */ pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx); SlruFileName(ctl, path, segno); + retry_after_download: fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) { - /* expected: file doesn't exist */ - if (errno == ENOENT) + if (errno == ENOENT && !attempted_download) { - fd = SimpleLruDownloadSegment(ctl, pageno, path); - if (fd < 0) - return false; - } - else - { - /* report error normally */ - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - SlruReportIOError(ctl, pageno, 0); + /* Try to download the file from the pageserver */ + attempted_download = true; + if (SimpleLruDownloadSegment(ctl, pageno, path)) + goto retry_after_download; + errno = ENOENT; } + + /* expected: file doesn't exist */ + if (errno == ENOENT) + return false; + + /* report error normally */ + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + SlruReportIOError(ctl, pageno, 0); } if ((endpos = lseek(fd, 0, SEEK_END)) < 0) @@ -882,6 +845,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno) off_t offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd; + bool attempted_download = false; SlruFileName(ctl, path, segno); @@ -892,33 +856,31 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno) * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case * where the file doesn't exist, and return zeroes instead. */ + retry_after_download: fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); if (fd < 0) { - if (errno != ENOENT) + if (errno == ENOENT && !attempted_download) + { + /* Try to download the file from the pageserver */ + attempted_download = true; + if (SimpleLruDownloadSegment(ctl, pageno, path)) + goto retry_after_download; + errno = ENOENT; + } + + if (errno != ENOENT || !InRecovery) { slru_errcause = SLRU_OPEN_FAILED; slru_errno = errno; return false; } - fd = SimpleLruDownloadSegment(ctl, pageno, path); - if (fd < 0) - { - if (!InRecovery) - { - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - return false; - } - else - { - ereport(LOG, - (errmsg("file \"%s\" doesn't exist, reading as zeroes", - path))); - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); - return true; - } - } + + ereport(LOG, + (errmsg("file \"%s\" doesn't exist, reading as zeroes", + path))); + MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + return true; } errno = 0; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index ceb1c0040cf..fcee6e50deb 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -83,6 +83,13 @@ static const f_smgr smgr_md = { .smgr_registersync = mdregistersync, }; +/* + * SLRU download isn't really part of the smgr API, as SLRUs are not + * relations. But we define this here anyway, to keep it close to the smgr + * hooks in Neon. + */ +read_slru_segment_hook_type read_slru_segment_hook; + /* * Each backend has a hashtable that stores all extant SMgrRelation objects. * In addition, "unpinned" SMgrRelation objects are chained together in a list. @@ -821,22 +828,24 @@ smgr_end_unlogged_build(SMgrRelation reln) } /* - * NEON: we do not want to include large pg_xact/multixact files in basebackup and prefer - * to download them on demand to reduce startup time. - * If SLRU segment is not found, we try to download it from page server + * NEON: Attempt to download an SLRU file from remote storage. + * + * To reduce startup time, we don't want to include large pg_xact/multixact + * files in the basebackup. Instead, we have this hook to download them on + * demand. If an SLRU segment is not found, the code in slru.c calls this to + * check if it can be downloaded from the pageserver. * - * This function returns number of blocks in segment. Usually it should be SLRU_PAGES_PER_SEGMENT but in case - * of partial segment, it can be smaller. Zero value means that segment doesn't exist. - * From Postgres point of view empty segment is the same as absent segment. + * If the segment is found in remote storage, the hook writes it to the local + * file and returns 'true'. If the file is not found, returns 'false'. */ -int -smgr_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer) +bool +smgr_read_slru_segment(const char *path, int segno) { - return (*reln->smgr).smgr_read_slru_segment ? (*reln->smgr).smgr_read_slru_segment(reln, path, segno, buffer) : 0; + if (read_slru_segment_hook) + return read_slru_segment_hook(path, segno); + return false; } - - /* * AtEOXact_SMgr * diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index e5d0f428361..cc61937eef7 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -116,7 +116,6 @@ pid_t PostmasterPid = 0; bool IsPostmasterEnvironment = false; bool IsUnderPostmaster = false; bool IsBinaryUpgrade = false; -bool neon_use_communicator_worker = false; bool ExitOnAnyError = false; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index a9f8f668c5b..e5b60d2bc3e 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -170,12 +170,6 @@ extern PGDLLIMPORT pid_t PostmasterPid; extern PGDLLIMPORT bool IsPostmasterEnvironment; extern PGDLLIMPORT bool IsUnderPostmaster; extern PGDLLIMPORT bool IsBinaryUpgrade; -/* Whether the communicator worker is used or not. Defined here so that - * it is also accessible from the main postgres code easily without - * having to look up the value using strings and chain of other - * functions. - */ -extern PGDLLIMPORT bool neon_use_communicator_worker; extern PGDLLIMPORT bool ExitOnAnyError; diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index fdb9a75b8e5..ad9868f1a71 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -130,8 +130,6 @@ typedef struct f_smgr void (*smgr_start_unlogged_build) (SMgrRelation reln); void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); void (*smgr_end_unlogged_build) (SMgrRelation reln); - - int (*smgr_read_slru_segment) (SMgrRelation reln, const char *path, int segno, void* buffer); } f_smgr; typedef void (*smgr_init_hook_type) (void); @@ -141,6 +139,10 @@ extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; extern void smgr_init_standard(void); extern void smgr_shutdown_standard(void); +/* NEON: Hook for reading an SLRU segment from e.g. remote storage */ +typedef bool (*read_slru_segment_hook_type) (const char *path, int segno); +extern read_slru_segment_hook_type read_slru_segment_hook; + // Alternative implementation of calculate_database_size() typedef int64 (*dbsize_hook_type) (Oid dbOid); extern PGDLLIMPORT dbsize_hook_type dbsize_hook; @@ -210,6 +212,6 @@ extern void smgr_start_unlogged_build(SMgrRelation reln); extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); extern void smgr_end_unlogged_build(SMgrRelation reln); -extern int smgr_read_slru_segment(SMgrRelation reln, const char *path, int segno, void* buffer); +extern bool smgr_read_slru_segment(const char *path, int segno); #endif /* SMGR_H */