@@ -39,22 +39,34 @@ def diff_sets(
3939 ignored_columns1 : Collection [str ],
4040 ignored_columns2 : Collection [str ],
4141) -> Iterator :
42- # Differ only by columns of interest (PKs+relevant-ignored). But yield with ignored ones!
43- sa : Set [_Row ] = {tuple (val for col , val in safezip (columns1 , row ) if col not in ignored_columns1 ) for row in a }
44- sb : Set [_Row ] = {tuple (val for col , val in safezip (columns2 , row ) if col not in ignored_columns2 ) for row in b }
45-
46- # The first items are always the PK (see TableSegment.relevant_columns)
47- diffs_by_pks : Dict [_PK , List [Tuple [_Op , _Row ]]] = defaultdict (list )
42+ # Group full rows by PKs on each side. The first items are the PK: TableSegment.relevant_columns
43+ rows_by_pks1 : Dict [_PK , List [_Row ]] = defaultdict (list )
44+ rows_by_pks2 : Dict [_PK , List [_Row ]] = defaultdict (list )
4845 for row in a :
4946 pk : _PK = tuple (val for col , val in zip (key_columns1 , row ))
50- cutrow : _Row = tuple (val for col , val in zip (columns1 , row ) if col not in ignored_columns1 )
51- if cutrow not in sb :
52- diffs_by_pks [pk ].append (("-" , row ))
47+ rows_by_pks1 [pk ].append (row )
5348 for row in b :
5449 pk : _PK = tuple (val for col , val in zip (key_columns2 , row ))
55- cutrow : _Row = tuple (val for col , val in zip (columns2 , row ) if col not in ignored_columns2 )
56- if cutrow not in sa :
57- diffs_by_pks [pk ].append (("+" , row ))
50+ rows_by_pks2 [pk ].append (row )
51+
52+ # Mind that the same pk MUST go in full with all the -/+ rows all at once, for grouping.
53+ diffs_by_pks : Dict [_PK , List [Tuple [_Op , _Row ]]] = defaultdict (list )
54+ for pk in sorted (set (rows_by_pks1 ) | set (rows_by_pks2 )):
55+ cutrows1 : List [_Row ] = [
56+ tuple (val for col , val in zip (columns1 , row1 ) if col not in ignored_columns1 ) for row1 in rows_by_pks1 [pk ]
57+ ]
58+ cutrows2 : List [_Row ] = [
59+ tuple (val for col , val in zip (columns2 , row2 ) if col not in ignored_columns2 ) for row2 in rows_by_pks2 [pk ]
60+ ]
61+
62+ # Either side has 0 rows: a clearly exclusive row.
63+ # Either side has 2+ rows: duplicates on either side, yield it all regardless of values.
64+ # Both sides == 1: non-duplicate, non-exclusive, so check for values of interest.
65+ if len (cutrows1 ) != 1 or len (cutrows2 ) != 1 or cutrows1 != cutrows2 :
66+ for row1 in rows_by_pks1 [pk ]:
67+ diffs_by_pks [pk ].append (("-" , row1 ))
68+ for row2 in rows_by_pks2 [pk ]:
69+ diffs_by_pks [pk ].append (("+" , row2 ))
5870
5971 warned_diff_cols = set ()
6072 for diffs in (diffs_by_pks [pk ] for pk in sorted (diffs_by_pks )):
0 commit comments