Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 40fa829

Browse files
Refactor MSC3030 /timestamp_to_event to move away from our snowflake pull from destination pattern (#14096)
1. `federation_client.timestamp_to_event(...)` now handles all `destination` looping and uses our generic `_try_destination_list(...)` helper. 2. Consistently handling `NotRetryingDestination` and `FederationDeniedError` across `get_pdu` , backfill, and the generic `_try_destination_list` which is used for many places we use this pattern. 3. `get_pdu(...)` now returns `PulledPduInfo` so we know which `destination` we ended up pulling the PDU from
1 parent 0d59ae7 commit 40fa829

File tree

7 files changed

+191
-126
lines changed

7 files changed

+191
-126
lines changed

changelog.d/14096.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Refactor [MSC3030](https://github.com/matrix-org/matrix-spec-proposals/pull/3030) `/timestamp_to_event` endpoint to loop over federation destinations with standard pattern and error handling.

synapse/federation/federation_client.py

Lines changed: 109 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,18 @@
8080
T = TypeVar("T")
8181

8282

83+
@attr.s(frozen=True, slots=True, auto_attribs=True)
84+
class PulledPduInfo:
85+
"""
86+
A result object that stores the PDU and info about it like which homeserver we
87+
pulled it from (`pull_origin`)
88+
"""
89+
90+
pdu: EventBase
91+
# Which homeserver we pulled the PDU from
92+
pull_origin: str
93+
94+
8395
class InvalidResponseError(RuntimeError):
8496
"""Helper for _try_destination_list: indicates that the server returned a response
8597
we couldn't parse
@@ -114,7 +126,9 @@ def __init__(self, hs: "HomeServer"):
114126
self.hostname = hs.hostname
115127
self.signing_key = hs.signing_key
116128

117-
self._get_pdu_cache: ExpiringCache[str, EventBase] = ExpiringCache(
129+
# Cache mapping `event_id` to a tuple of the event itself and the `pull_origin`
130+
# (which server we pulled the event from)
131+
self._get_pdu_cache: ExpiringCache[str, Tuple[EventBase, str]] = ExpiringCache(
118132
cache_name="get_pdu_cache",
119133
clock=self._clock,
120134
max_len=1000,
@@ -352,11 +366,11 @@ async def _record_failure_callback(
352366
@tag_args
353367
async def get_pdu(
354368
self,
355-
destinations: Iterable[str],
369+
destinations: Collection[str],
356370
event_id: str,
357371
room_version: RoomVersion,
358372
timeout: Optional[int] = None,
359-
) -> Optional[EventBase]:
373+
) -> Optional[PulledPduInfo]:
360374
"""Requests the PDU with given origin and ID from the remote home
361375
servers.
362376
@@ -371,11 +385,11 @@ async def get_pdu(
371385
moving to the next destination. None indicates no timeout.
372386
373387
Returns:
374-
The requested PDU, or None if we were unable to find it.
388+
The requested PDU wrapped in `PulledPduInfo`, or None if we were unable to find it.
375389
"""
376390

377391
logger.debug(
378-
"get_pdu: event_id=%s from destinations=%s", event_id, destinations
392+
"get_pdu(event_id=%s): from destinations=%s", event_id, destinations
379393
)
380394

381395
# TODO: Rate limit the number of times we try and get the same event.
@@ -384,19 +398,25 @@ async def get_pdu(
384398
# it gets persisted to the database), so we cache the results of the lookup.
385399
# Note that this is separate to the regular get_event cache which caches
386400
# events once they have been persisted.
387-
event = self._get_pdu_cache.get(event_id)
401+
get_pdu_cache_entry = self._get_pdu_cache.get(event_id)
388402

403+
event = None
404+
pull_origin = None
405+
if get_pdu_cache_entry:
406+
event, pull_origin = get_pdu_cache_entry
389407
# If we don't see the event in the cache, go try to fetch it from the
390408
# provided remote federated destinations
391-
if not event:
409+
else:
392410
pdu_attempts = self.pdu_destination_tried.setdefault(event_id, {})
393411

412+
# TODO: We can probably refactor this to use `_try_destination_list`
394413
for destination in destinations:
395414
now = self._clock.time_msec()
396415
last_attempt = pdu_attempts.get(destination, 0)
397416
if last_attempt + PDU_RETRY_TIME_MS > now:
398417
logger.debug(
399-
"get_pdu: skipping destination=%s because we tried it recently last_attempt=%s and we only check every %s (now=%s)",
418+
"get_pdu(event_id=%s): skipping destination=%s because we tried it recently last_attempt=%s and we only check every %s (now=%s)",
419+
event_id,
400420
destination,
401421
last_attempt,
402422
PDU_RETRY_TIME_MS,
@@ -411,43 +431,48 @@ async def get_pdu(
411431
room_version=room_version,
412432
timeout=timeout,
413433
)
434+
pull_origin = destination
414435

415436
pdu_attempts[destination] = now
416437

417438
if event:
418439
# Prime the cache
419-
self._get_pdu_cache[event.event_id] = event
440+
self._get_pdu_cache[event.event_id] = (event, pull_origin)
420441

421442
# Now that we have an event, we can break out of this
422443
# loop and stop asking other destinations.
423444
break
424445

446+
except NotRetryingDestination as e:
447+
logger.info("get_pdu(event_id=%s): %s", event_id, e)
448+
continue
449+
except FederationDeniedError:
450+
logger.info(
451+
"get_pdu(event_id=%s): Not attempting to fetch PDU from %s because the homeserver is not on our federation whitelist",
452+
event_id,
453+
destination,
454+
)
455+
continue
425456
except SynapseError as e:
426457
logger.info(
427-
"Failed to get PDU %s from %s because %s",
458+
"get_pdu(event_id=%s): Failed to get PDU from %s because %s",
428459
event_id,
429460
destination,
430461
e,
431462
)
432463
continue
433-
except NotRetryingDestination as e:
434-
logger.info(str(e))
435-
continue
436-
except FederationDeniedError as e:
437-
logger.info(str(e))
438-
continue
439464
except Exception as e:
440465
pdu_attempts[destination] = now
441466

442467
logger.info(
443-
"Failed to get PDU %s from %s because %s",
468+
"get_pdu(event_id=): Failed to get PDU from %s because %s",
444469
event_id,
445470
destination,
446471
e,
447472
)
448473
continue
449474

450-
if not event:
475+
if not event or not pull_origin:
451476
return None
452477

453478
# `event` now refers to an object stored in `get_pdu_cache`. Our
@@ -459,7 +484,7 @@ async def get_pdu(
459484
event.room_version,
460485
)
461486

462-
return event_copy
487+
return PulledPduInfo(event_copy, pull_origin)
463488

464489
@trace
465490
@tag_args
@@ -699,12 +724,14 @@ async def _check_sigs_and_hash_and_fetch_one(
699724
pdu_origin = get_domain_from_id(pdu.sender)
700725
if not res and pdu_origin != origin:
701726
try:
702-
res = await self.get_pdu(
727+
pulled_pdu_info = await self.get_pdu(
703728
destinations=[pdu_origin],
704729
event_id=pdu.event_id,
705730
room_version=room_version,
706731
timeout=10000,
707732
)
733+
if pulled_pdu_info is not None:
734+
res = pulled_pdu_info.pdu
708735
except SynapseError:
709736
pass
710737

@@ -806,6 +833,7 @@ async def _try_destination_list(
806833
)
807834

808835
for destination in destinations:
836+
# We don't want to ask our own server for information we don't have
809837
if destination == self.server_name:
810838
continue
811839

@@ -814,9 +842,21 @@ async def _try_destination_list(
814842
except (
815843
RequestSendFailed,
816844
InvalidResponseError,
817-
NotRetryingDestination,
818845
) as e:
819846
logger.warning("Failed to %s via %s: %s", description, destination, e)
847+
# Skip to the next homeserver in the list to try.
848+
continue
849+
except NotRetryingDestination as e:
850+
logger.info("%s: %s", description, e)
851+
continue
852+
except FederationDeniedError:
853+
logger.info(
854+
"%s: Not attempting to %s from %s because the homeserver is not on our federation whitelist",
855+
description,
856+
description,
857+
destination,
858+
)
859+
continue
820860
except UnsupportedRoomVersionError:
821861
raise
822862
except HttpResponseException as e:
@@ -1609,6 +1649,54 @@ async def send_request(
16091649
return result
16101650

16111651
async def timestamp_to_event(
1652+
self, *, destinations: List[str], room_id: str, timestamp: int, direction: str
1653+
) -> Optional["TimestampToEventResponse"]:
1654+
"""
1655+
Calls each remote federating server from `destinations` asking for their closest
1656+
event to the given timestamp in the given direction until we get a response.
1657+
Also validates the response to always return the expected keys or raises an
1658+
error.
1659+
1660+
Args:
1661+
destinations: The domains of homeservers to try fetching from
1662+
room_id: Room to fetch the event from
1663+
timestamp: The point in time (inclusive) we should navigate from in
1664+
the given direction to find the closest event.
1665+
direction: ["f"|"b"] to indicate whether we should navigate forward
1666+
or backward from the given timestamp to find the closest event.
1667+
1668+
Returns:
1669+
A parsed TimestampToEventResponse including the closest event_id
1670+
and origin_server_ts or None if no destination has a response.
1671+
"""
1672+
1673+
async def _timestamp_to_event_from_destination(
1674+
destination: str,
1675+
) -> TimestampToEventResponse:
1676+
return await self._timestamp_to_event_from_destination(
1677+
destination, room_id, timestamp, direction
1678+
)
1679+
1680+
try:
1681+
# Loop through each homeserver candidate until we get a succesful response
1682+
timestamp_to_event_response = await self._try_destination_list(
1683+
"timestamp_to_event",
1684+
destinations,
1685+
# TODO: The requested timestamp may lie in a part of the
1686+
# event graph that the remote server *also* didn't have,
1687+
# in which case they will have returned another event
1688+
# which may be nowhere near the requested timestamp. In
1689+
# the future, we may need to reconcile that gap and ask
1690+
# other homeservers, and/or extend `/timestamp_to_event`
1691+
# to return events on *both* sides of the timestamp to
1692+
# help reconcile the gap faster.
1693+
_timestamp_to_event_from_destination,
1694+
)
1695+
return timestamp_to_event_response
1696+
except SynapseError:
1697+
return None
1698+
1699+
async def _timestamp_to_event_from_destination(
16121700
self, destination: str, room_id: str, timestamp: int, direction: str
16131701
) -> "TimestampToEventResponse":
16141702
"""

synapse/handlers/federation.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,15 @@ async def try_backfill(domains: Collection[str]) -> bool:
442442
# appropriate stuff.
443443
# TODO: We can probably do something more intelligent here.
444444
return True
445+
except NotRetryingDestination as e:
446+
logger.info("_maybe_backfill_inner: %s", e)
447+
continue
448+
except FederationDeniedError:
449+
logger.info(
450+
"_maybe_backfill_inner: Not attempting to backfill from %s because the homeserver is not on our federation whitelist",
451+
dom,
452+
)
453+
continue
445454
except (SynapseError, InvalidResponseError) as e:
446455
logger.info("Failed to backfill from %s because %s", dom, e)
447456
continue
@@ -477,15 +486,9 @@ async def try_backfill(domains: Collection[str]) -> bool:
477486

478487
logger.info("Failed to backfill from %s because %s", dom, e)
479488
continue
480-
except NotRetryingDestination as e:
481-
logger.info(str(e))
482-
continue
483489
except RequestSendFailed as e:
484490
logger.info("Failed to get backfill from %s because %s", dom, e)
485491
continue
486-
except FederationDeniedError as e:
487-
logger.info(e)
488-
continue
489492
except Exception as e:
490493
logger.exception("Failed to backfill from %s because %s", dom, e)
491494
continue

synapse/handlers/federation_event.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
)
5959
from synapse.events import EventBase
6060
from synapse.events.snapshot import EventContext
61-
from synapse.federation.federation_client import InvalidResponseError
61+
from synapse.federation.federation_client import InvalidResponseError, PulledPduInfo
6262
from synapse.logging.context import nested_logging_context
6363
from synapse.logging.opentracing import (
6464
SynapseTags,
@@ -1517,8 +1517,8 @@ async def _handle_marker_event(self, origin: str, marker_event: EventBase) -> No
15171517
)
15181518

15191519
async def backfill_event_id(
1520-
self, destination: str, room_id: str, event_id: str
1521-
) -> EventBase:
1520+
self, destinations: List[str], room_id: str, event_id: str
1521+
) -> PulledPduInfo:
15221522
"""Backfill a single event and persist it as a non-outlier which means
15231523
we also pull in all of the state and auth events necessary for it.
15241524
@@ -1530,38 +1530,35 @@ async def backfill_event_id(
15301530
Raises:
15311531
FederationError if we are unable to find the event from the destination
15321532
"""
1533-
logger.info(
1534-
"backfill_event_id: event_id=%s from destination=%s", event_id, destination
1535-
)
1533+
logger.info("backfill_event_id: event_id=%s", event_id)
15361534

15371535
room_version = await self._store.get_room_version(room_id)
15381536

1539-
event_from_response = await self._federation_client.get_pdu(
1540-
[destination],
1537+
pulled_pdu_info = await self._federation_client.get_pdu(
1538+
destinations,
15411539
event_id,
15421540
room_version,
15431541
)
15441542

1545-
if not event_from_response:
1543+
if not pulled_pdu_info:
15461544
raise FederationError(
15471545
"ERROR",
15481546
404,
1549-
"Unable to find event_id=%s from destination=%s to backfill."
1550-
% (event_id, destination),
1547+
f"Unable to find event_id={event_id} from remote servers to backfill.",
15511548
affected=event_id,
15521549
)
15531550

15541551
# Persist the event we just fetched, including pulling all of the state
15551552
# and auth events to de-outlier it. This also sets up the necessary
15561553
# `state_groups` for the event.
15571554
await self._process_pulled_events(
1558-
destination,
1559-
[event_from_response],
1555+
pulled_pdu_info.pull_origin,
1556+
[pulled_pdu_info.pdu],
15601557
# Prevent notifications going to clients
15611558
backfilled=True,
15621559
)
15631560

1564-
return event_from_response
1561+
return pulled_pdu_info
15651562

15661563
@trace
15671564
@tag_args
@@ -1584,19 +1581,19 @@ async def _get_events_and_persist(
15841581
async def get_event(event_id: str) -> None:
15851582
with nested_logging_context(event_id):
15861583
try:
1587-
event = await self._federation_client.get_pdu(
1584+
pulled_pdu_info = await self._federation_client.get_pdu(
15881585
[destination],
15891586
event_id,
15901587
room_version,
15911588
)
1592-
if event is None:
1589+
if pulled_pdu_info is None:
15931590
logger.warning(
15941591
"Server %s didn't return event %s",
15951592
destination,
15961593
event_id,
15971594
)
15981595
return
1599-
events.append(event)
1596+
events.append(pulled_pdu_info.pdu)
16001597

16011598
except Exception as e:
16021599
logger.warning(

0 commit comments

Comments
 (0)