Mutual columns

erezsh · erezsh · commit 6a4c44385375 · 2022-08-01T15:27:31.000+02:00
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -5,13 +5,14 @@
 import logging
 from itertools import islice
 
-from .utils import remove_password_from_url
+from .utils import remove_password_from_url, safezip
 
 from .diff_tables import (
     TableSegment,
     TableDiffer,
     DEFAULT_BISECTION_THRESHOLD,
     DEFAULT_BISECTION_FACTOR,
+    create_schema,
 )
 from .databases.connect import connect
 from .parse_time import parse_time_before_now, UNITS_STR, ParseError
@@ -39,6 +40,11 @@ def _remove_passwords_in_dict(d: dict):
             d[k] = remove_password_from_url(v)
 
 
+def _get_schema(pair):
+    db, table_path = pair  # one (database, table path) argument, so the function can be mapped over pairs
+    return db.query_table_schema(table_path)
+
+
 @click.command()
 @click.argument("database1", required=False)
 @click.argument("table1", required=False)
@@ -67,7 +73,12 @@ def _remove_passwords_in_dict(d: dict):
 @click.option("--json", "json_output", is_flag=True, help="Print JSONL output for machine readability")
 @click.option("-v", "--verbose", is_flag=True, help="Print extra info")
 @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
-@click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
+@click.option(
+    "--case-sensitive",
+    is_flag=True,
+    help="Column names are treated as case-sensitive. Otherwise, correct case according to schema.",
+)
+@click.option("--mutual-columns", is_flag=True, help="Also diff all columns present in both tables.")
 @click.option(
     "-j",
     "--threads",
@@ -111,7 +122,8 @@ def _main(
     verbose,
     interactive,
     threads,
-    keep_column_case,
+    case_sensitive,
+    mutual_columns,
     json_output,
     where,
     threads1=None,
@@ -158,35 +170,53 @@ def _main(
 
     db1 = connect(database1, threads1 or threads)
     db2 = connect(database2, threads2 or threads)
+    dbs = db1, db2
 
     if interactive:
-        db1.enable_interactive()
-        db2.enable_interactive()
+        for db in dbs:
+            db.enable_interactive()
 
     start = time.time()
 
     try:
         options = dict(
             min_update=max_age and parse_time_before_now(max_age),
             max_update=min_age and parse_time_before_now(min_age),
-            case_sensitive=keep_column_case,
+            case_sensitive=case_sensitive,
             where=where,
         )
     except ParseError as e:
         logging.error("Error while parsing age expression: %s" % e)
         return
 
-    table1_seg = TableSegment(db1, db1.parse_table_name(table1), key_column, update_column, columns, **options)
-    table2_seg = TableSegment(db2, db2.parse_table_name(table2), key_column, update_column, columns, **options)
-
     differ = TableDiffer(
         bisection_factor=bisection_factor,
         bisection_threshold=bisection_threshold,
         threaded=threaded,
         max_threadpool_size=threads and threads * 2,
         debug=debug,
     )
-    diff_iter = differ.diff_tables(table1_seg, table2_seg)
+
+    table_names = table1, table2
+    table_paths = [db.parse_table_name(t) for db, t in safezip(dbs, table_names)]
+
+    schemas = list(differ._thread_map(_get_schema, safezip(dbs, table_paths)))  # query_table_schema() result per table
+    schema1, schema2 = schemas = [  # rebinds `schemas`: from here on it holds the create_schema() mappings
+        create_schema(db, table_path, schema, case_sensitive)
+        for db, table_path, schema in safezip(dbs, table_paths, schemas)
+    ]
+
+    if mutual_columns:
+        mutual = schema1.keys() & schema2.keys()  # Case-aware: keys are lowercased unless case_sensitive
+        provided_columns = {c if case_sensitive else c.lower() for c in (key_column, update_column, *columns) if c}
+        columns += tuple(sorted(mutual - provided_columns))  # sorted: set order is not stable between runs
+
+    segments = [  # one TableSegment per table, each returned by _with_raw_schema() with its schema set
+        TableSegment(db, table_path, key_column, update_column, columns, **options)._with_raw_schema(raw_schema)
+        for db, table_path, raw_schema in safezip(dbs, table_paths, schemas)  # NOTE: create_schema() mappings
+    ]
+
+    diff_iter = differ.diff_tables(*segments)
 
     if limit:
         diff_iter = islice(diff_iter, int(limit))
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
@@ -7,7 +7,7 @@
 import threading
 from abc import abstractmethod
 
-from data_diff.utils import is_uuid, safezip
+from data_diff.utils import CaseAwareMapping, is_uuid, safezip
 from .database_types import (
     AbstractDatabase,
     ColType,
@@ -180,16 +180,19 @@ def select_table_schema(self, path: DbPath) -> str:
             f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
         )
 
-    def query_table_schema(self, path: DbPath, filter_columns: Optional[Sequence[str]] = None) -> Dict[str, ColType]:
+    def query_table_schema(self, path: DbPath) -> Dict[str, ColType]:
         rows = self.query(self.select_table_schema(path), list)
         if not rows:
             raise RuntimeError(f"{self.name}: Table '{'.'.join(path)}' does not exist, or has no columns")
 
-        if filter_columns is not None:
-            accept = {i.lower() for i in filter_columns}
-            rows = [r for r in rows if r[0].lower() in accept]
+        d = {r[0]: r for r in rows}  # first field of each row (presumably the column name) -> the whole row
+        assert len(d) == len(rows)  # no two rows may share that first field
+        return d
 
-        col_dict: Dict[str, ColType] = {row[0]: self._parse_type(path, *row) for row in rows}
+    def _process_table_schema(self, path: DbPath, raw_schema: dict, filter_columns: Sequence[str]):
+        accept = {i.lower() for i in filter_columns}
+        # Key the result by row[0], the name stored in the row: a case-insensitive mapping yields lowercased keys.
+        col_dict = {row[0]: self._parse_type(path, *row) for name, row in raw_schema.items() if name.lower() in accept}
 
         self._refine_coltypes(path, col_dict)
 
diff --git a/data_diff/databases/database_types.py b/data_diff/databases/database_types.py
@@ -1,11 +1,11 @@
 import decimal
 from abc import ABC, abstractmethod
-from typing import Sequence, Optional, Tuple, Union, Dict, List
+from typing import Mapping, Sequence, Optional, Tuple, Union, Dict, List
 from datetime import datetime
 
 from runtype import dataclass
 
-from data_diff.utils import ArithAlphanumeric, ArithUUID, ArithString
+from data_diff.utils import ArithAlphanumeric, ArithUUID, CaseAwareMapping
 
 
 DbPath = Tuple[str, ...]
@@ -254,44 +254,4 @@ def _normalize_table_path(self, path: DbPath) -> DbPath:
         ...
 
 
-class Schema(ABC):
-    @abstractmethod
-    def get_key(self, key: str) -> str:
-        ...
-
-    @abstractmethod
-    def __getitem__(self, key: str) -> ColType:
-        ...
-
-    @abstractmethod
-    def __setitem__(self, key: str, value):
-        ...
-
-    @abstractmethod
-    def __contains__(self, key: str) -> bool:
-        ...
-
-
-class Schema_CaseSensitive(dict, Schema):
-    def get_key(self, key):
-        return key
-
-
-class Schema_CaseInsensitive(Schema):
-    def __init__(self, initial):
-        self._dict = {k.lower(): (k, v) for k, v in dict(initial).items()}
-
-    def get_key(self, key: str) -> str:
-        return self._dict[key.lower()][0]
-
-    def __getitem__(self, key: str) -> ColType:
-        return self._dict[key.lower()][1]
-
-    def __setitem__(self, key: str, value):
-        k = key.lower()
-        if k in self._dict:
-            key = self._dict[k][0]
-        self._dict[k] = key, value
-
-    def __contains__(self, key):
-        return key.lower() in self._dict
+Schema = CaseAwareMapping
diff --git a/data_diff/databases/mysql.py b/data_diff/databases/mysql.py
@@ -47,7 +47,7 @@ def create_connection(self):
             elif e.errno == mysql.errorcode.ER_BAD_DB_ERROR:
                 raise ConnectError("Database does not exist") from e
             else:
-                raise ConnectError(*e._args) from e
+                raise ConnectError(*e.args) from e  # an exception object is not iterable; its args tuple is
 
     def quote(self, s: str):
         return f"`{s}`"
diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -13,7 +13,7 @@
 from runtype import dataclass
 
 from .sql import Select, Checksum, Compare, DbPath, DbKey, DbTime, Count, TableName, Time, Value
-from .utils import safezip, split_space
+from .utils import CaseInsensitiveDict, safezip, split_space, CaseSensitiveDict
 from .databases.base import Database
 from .databases.database_types import (
     ArithString,
@@ -23,8 +23,6 @@
     PrecisionType,
     StringType,
     Schema,
-    Schema_CaseInsensitive,
-    Schema_CaseSensitive,
 )
 
 logger = logging.getLogger("diff_tables")
@@ -35,6 +33,18 @@
 DEFAULT_BISECTION_FACTOR = 32
 
 
+def create_schema(db: Database, table_path: DbPath, schema: dict, case_sensitive: bool) -> Schema:
+    "Wraps `schema` in a case-sensitive or case-insensitive mapping, according to `case_sensitive`."
+    logger.debug(f"[{db.name}] Schema = {schema}")
+    if case_sensitive:
+        return CaseSensitiveDict(schema)
+
+    if len({k.lower() for k in schema}) < len(schema):  # some column names differ only by case
+        logger.warning(f'Ambiguous schema for {db}:{".".join(table_path)} | Columns = {", ".join(list(schema))}')
+        logger.warning("We recommend to disable case-insensitivity (set --case-sensitive).")
+    return CaseInsensitiveDict(schema)
+
+
 @dataclass(frozen=False)
 class TableSegment:
     """Signifies a segment of rows (and selected columns) within a table
@@ -116,26 +126,16 @@ def _normalize_column(self, name: str, template: str = None) -> str:
 
         return self.database.normalize_value_by_type(col, col_type)
 
+    def _with_raw_schema(self, raw_schema: dict) -> "TableSegment":  # new segment with `_schema` set
+        schema = self.database._process_table_schema(self.table_path, raw_schema, self._relevant_columns)
+        return self.new(_schema=create_schema(self.database, self.table_path, schema, self.case_sensitive))
+
     def with_schema(self) -> "TableSegment":
         "Queries the table schema from the database, and returns a new instance of TableSegment, with a schema."
         if self._schema:
             return self
 
-        schema = self.database.query_table_schema(self.table_path, self._relevant_columns)
-        logger.debug(f"[{self.database.name}] Schema = {schema}")
-
-        schema_inst: Schema
-        if self.case_sensitive:
-            schema_inst = Schema_CaseSensitive(schema)
-        else:
-            if len({k.lower() for k in schema}) < len(schema):
-                logger.warning(
-                    f'Ambiguous schema for {self.database}:{".".join(self.table_path)} | Columns = {", ".join(list(schema))}'
-                )
-                logger.warning("We recommend to disable case-insensitivity (remove --any-case).")
-            schema_inst = Schema_CaseInsensitive(schema)
-
-        return self.new(_schema=schema_inst)
+        return self._with_raw_schema(self.database.query_table_schema(self.table_path))
 
     def _make_key_range(self):
         if self.min_key is not None:
diff --git a/data_diff/utils.py b/data_diff/utils.py
@@ -1,7 +1,8 @@
 import math
+from typing import Iterable, Tuple, Union, Any
+from typing import TypeVar, Generic
+from abc import ABC, abstractmethod
 from urllib.parse import urlparse
-
-from typing import Union, Any
 from uuid import UUID
 import string
 
@@ -150,9 +151,64 @@ def remove_password_from_url(url: str, replace_with: str = "***") -> str:
     return replaced.geturl()
 
 
-def join_iter(joiner: Any, iterable: iter) -> iter:
+def join_iter(joiner: Any, iterable: Iterable) -> Iterable:  # NOTE(review): an empty iterable fails at next(it)
     it = iter(iterable)
     yield next(it)
     for i in it:
         yield joiner
         yield i
+
+
+V = TypeVar("V")
+
+
+class CaseAwareMapping(ABC, Generic[V]):  # str-keyed mapping interface; subclasses decide how key case is treated
+    @abstractmethod
+    def get_key(self, key: str) -> str:  # the key as the mapping stores it
+        ...
+
+    @abstractmethod
+    def __getitem__(self, key: str) -> V:
+        ...
+
+    @abstractmethod
+    def __setitem__(self, key: str, value: V):
+        ...
+
+    @abstractmethod
+    def __contains__(self, key: str) -> bool:
+        ...
+
+
+class CaseInsensitiveDict(CaseAwareMapping):
+    def __init__(self, initial):
+        self._dict = {k.lower(): (k, v) for k, v in dict(initial).items()}  # lowercased key -> (original key, value)
+
+    def get_key(self, key: str) -> str:
+        return self._dict[key.lower()][0]  # the key in the case it was stored with
+
+    def __getitem__(self, key: str) -> V:
+        return self._dict[key.lower()][1]
+
+    def __setitem__(self, key: str, value):
+        k = key.lower()
+        if k in self._dict:
+            key = self._dict[k][0]  # keep the case the key was first stored with
+        self._dict[k] = key, value
+
+    def __contains__(self, key):
+        return key.lower() in self._dict
+
+    def keys(self) -> Iterable[str]:
+        return self._dict.keys()  # note: the lowercased keys, not the original-case ones
+
+    def items(self) -> Iterable[Tuple[str, V]]:
+        return ((k, v[1]) for k, v in self._dict.items())  # (lowercased key, value) pairs
+
+
+class CaseSensitiveDict(dict, CaseAwareMapping):  # a plain dict: keys match by exact case only
+    def get_key(self, key):
+        return key  # returned unchanged; no lookup is done
+
+    def as_insensitive(self):
+        return CaseInsensitiveDict(self)  # case-insensitive mapping built from this one's items