Merged
Changes from 15 commits

Commits (32)
6f75e51
Implement feature to search jobs with a sp- and doc-integrated filter.
csadorf May 19, 2019
531c07e
A JobsCursor always uses an integrated filter.
csadorf May 19, 2019
93a893d
Extend unit tests to cover integrated and mixed filters.
csadorf May 19, 2019
d5a55c4
Fix bugs with respect to mixed and integrated filters.
csadorf May 19, 2019
99a5847
Fix bug in unit test implementation.
csadorf May 19, 2019
9860674
merge master
vishav1771 Apr 27, 2020
5eb7c75
cleanup and updated tests
vishav1771 Apr 29, 2020
62a7a75
error
vishav1771 Apr 29, 2020
93c237e
linting
vishav1771 Apr 29, 2020
7c038ac
changes
vishav1771 Apr 29, 2020
6c1228f
changes
vishav1771 Apr 29, 2020
8fb9bd0
error
vishav1771 Apr 29, 2020
0987153
Changed groupby
vishav1771 May 10, 2020
b4df5ea
Merge master
vishav1771 May 10, 2020
007afcd
Updated doc-string of groupby
vishav1771 May 10, 2020
70ce707
Changes
vishav1771 May 12, 2020
aec0e78
Added Comment to xfail
vishav1771 May 12, 2020
4a734eb
Linting Change
vishav1771 May 12, 2020
b275b5d
Reverting init change
vishav1771 May 12, 2020
28b88ef
changes
vishav1771 May 14, 2020
3643f72
init change
vishav1771 May 19, 2020
bae6744
Changes
vishav1771 May 21, 2020
3859368
Lint
vishav1771 May 21, 2020
f5e2fc1
Merge branch 'master' into feature/integrated-queries
csadorf May 25, 2020
29c7c84
Merge remote-tracking branch 'origin/master' into feature/integrated-…
vyasr Feb 2, 2021
6ba435d
Address PR comments, fix a bug found in the process, remove docstring…
vyasr Feb 10, 2021
2f6ca87
Merge branch 'master' into feature/integrated-queries
vyasr Feb 10, 2021
f5eb165
add a test for regex
mikemhenry Feb 10, 2021
9870ce5
Add a few more tests of basic behavior.
vyasr Feb 10, 2021
8f9504e
Update changelog.
vyasr Feb 10, 2021
b81817c
Update changelog.
vyasr Feb 10, 2021
0941688
Fix typo in changelog.
vyasr Feb 10, 2021
73 changes: 59 additions & 14 deletions signac/contrib/filterparse.py
@@ -68,19 +68,19 @@ def _cast(x):
     return x


-def _parse_simple(key, value=None):
+def _parse_single(key, value=None):
     if value is None or value == '!':
-        return {key: {'$exists': True}}
+        return key, {'$exists': True}
     elif _is_json(value):
-        return {key: _parse_json(value)}
+        return key, _parse_json(value)
     elif _is_regex(value):
-        return {key: {'$regex': value[1:-1]}}
+        return key, {'$regex': value[1:-1]}
     elif _is_json(key):
         raise ValueError(
             "Please check your filter arguments. "
             "Using as JSON expression as key is not allowed: '{}'.".format(key))
     else:
-        return {key: _cast(value)}
+        return key, _cast(value)


 def parse_filter_arg(args, file=sys.stderr):
@@ -90,14 +90,59 @@ def parse_filter_arg(args, file=sys.stderr):
         if _is_json(args[0]):
             return _parse_json(args[0])
         else:
-            return _with_message(_parse_simple(args[0]), file)
+            key, value = _parse_single(args[0])
+            return _with_message({key: value}, file)
     else:
-        q = dict()
-        for i in range(0, len(args), 2):
-            key = args[i]
-            if i+1 < len(args):
-                value = args[i+1]
-            else:
-                value = None
-            q.update(_parse_simple(key, value))
+        q = dict(parse_simple(args))
+
         return _with_message(q, file)
+
+
+def parse_simple(tokens):
+    for i in range(0, len(tokens), 2):
+        key = tokens[i]
+        if i+1 < len(tokens):
+            value = tokens[i+1]
+        else:
+            value = None
+        yield _parse_single(key, value)
+
+
+def _add_prefix(filter, prefix):
+    for key, value in filter:
+        if key in ('$and', '$or'):
+            if isinstance(value, list) or isinstance(value, tuple):
+                yield key, [dict(_add_prefix(item.items(), prefix)) for item in value]
+            else:
+                raise ValueError(
+                    "The argument to a logical operator must be a sequence (e.g. a list)!")
+        elif '.' in key and key.split('.', 1)[0] in ('sp', 'doc'):
+            yield key, value
+        elif key in ('sp', 'doc'):
+            yield key, value
+        else:
+            yield prefix + '.' + key, value
+
+
+def _root_keys(filter):
+    for key, value in filter.items():
+        if key in ('$and', '$or'):
+            assert isinstance(value, (list, tuple))
+            for item in value:
+                for key in _root_keys(item):
+                    yield key
+        elif '.' in key:
+            yield key.split('.', 1)[0]
+        else:
+            yield key
+
+
+def _parse_filter(filter):
+    if isinstance(filter, str):
+        yield from parse_simple(filter.split())
+    elif filter:
+        yield from filter.items()
+
+
+def parse_filter(filter, prefix='sp'):
+    yield from _add_prefix(_parse_filter(filter), prefix)
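
For orientation, here is a sketch of how the new public entry point behaves. This session is illustrative and not part of the diff; it assumes this branch is installed so that signac.contrib.filterparse is importable.

# Illustrative sketch of the new integrated-filter parsing (not part of the diff).
from signac.contrib.filterparse import parse_filter

# Unprefixed keys are interpreted as state point keys and receive the 'sp.' prefix.
print(dict(parse_filter({'a': 1})))
# {'sp.a': 1}

# Keys already prefixed with 'sp.' or 'doc.' pass through unchanged, so state
# point and document criteria can be mixed in a single filter.
print(dict(parse_filter({'a': 1, 'doc.b': 2})))
# {'sp.a': 1, 'doc.b': 2}

# _add_prefix recurses into '$and'/'$or' expressions.
print(dict(parse_filter({'$or': [{'a': 1}, {'doc.b': 2}]})))
# {'$or': [{'sp.a': 1}, {'doc.b': 2}]}

# String filters are tokenized into key/value pairs by parse_simple.
print(dict(parse_filter('a 1 doc.b 2')))
# {'sp.a': 1, 'doc.b': 2}

_root_keys supports the indexing decision in Project._find_job_ids below: it reports the top-level prefixes of a parsed filter, so the project only loads job documents into the index when 'doc' actually appears in the query.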
2 changes: 1 addition & 1 deletion signac/contrib/import_export.py
@@ -48,7 +48,7 @@ def _make_schema_based_path_function(jobs, exclude_keys=None, delimiter_nested=
         # signature of the path function below.
         return lambda job, sep=None: ''

-    index = [{'_id': job._id, 'statepoint': job.sp()} for job in jobs]
+    index = [{'_id': job._id, 'sp': job.sp()} for job in jobs]
     jsi = _build_job_statepoint_index(jobs=jobs, exclude_const=True, index=index)
     sp_index = OrderedDict(jsi)
4 changes: 2 additions & 2 deletions signac/contrib/linked_view.py
@@ -25,10 +25,10 @@ def create_linked_view(project, prefix=None, job_ids=None, index=None, path=None

     if index is None:
         if job_ids is None:
-            index = [{'_id': job._id, 'statepoint': job.sp()} for job in project]
+            index = [{'_id': job._id, 'sp': job.sp()} for job in project]
             jobs = list(project)
         else:
-            index = [{'_id': job_id, 'statepoint': project.open_job(id=job_id).sp()}
+            index = [{'_id': job_id, 'sp': project.open_job(id=job_id).sp()}
                      for job_id in job_ids]
             jobs = list(project.open_job(id=job_id) for job_id in job_ids)
     elif job_ids is not None:
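
Both renames above belong to the same schema change: index documents now store the state point under 'sp' (matching the new filter prefix) rather than 'statepoint'. A hedged sketch of the resulting index-entry shape, with an invented id and values; per the _build_index hunk in project.py below, the job document is nested under a separate 'doc' key when it is indexed:

# Hypothetical index entry after this change (id and values invented).
index_entry = {
    '_id': '0d55c362...',            # job id (placeholder)
    'sp': {'a': 1, 'b': 2},          # was: 'statepoint': {...}
    'doc': {'converged': True},      # present only with include_job_document=True
}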
133 changes: 89 additions & 44 deletions signac/contrib/project.py
@@ -35,6 +35,7 @@
 from .errors import WorkspaceError
 from .errors import DestinationExistsError
 from .errors import JobsCorruptedError
+from .filterparse import parse_filter, _root_keys
 from .errors import IncompatibleSchemaVersion

 logger = logging.getLogger(__name__)
@@ -76,17 +77,23 @@ def __init__(self, index, _trust=False):
     def __len__(self):
         return len(self._collection)

-    def _resolve_statepoint_filter(self, q):
-        for k, v in q.items():
-            if k in ('$and', '$or'):
-                if not isinstance(v, list) or isinstance(v, tuple):
-                    raise ValueError(
-                        "The argument to a logical operator must be a sequence (e.g. a list)!")
-                yield k, [dict(self._resolve_statepoint_filter(i)) for i in v]
-            else:
-                yield 'statepoint.{}'.format(k), v
-
-    def find_job_ids(self, filter=None, doc_filter=None):
-        """Find the job_ids of all jobs matching the filters.
-
-        The optional filter arguments must be a Mapping of key-value
-        pairs and JSON serializable.
-
-        :param filter: A mapping of key-value pairs that all
-            indexed job statepoints are compared against.
-        :type filter: Mapping
-        :param doc_filter: A mapping of key-value pairs that all
-            indexed job documents are compared against.
-        :yields: The ids of all indexed jobs matching both filters.
-        :raise TypeError: If the filters are not JSON serializable.
-        :raises ValueError: If the filters are invalid.
-        :raises RuntimeError: If the filters are not supported
-            by the index.
-        """
-        if filter:
-            filter = dict(self._resolve_statepoint_filter(filter))
-            if doc_filter:
@@ -550,10 +557,9 @@ def build_job_statepoint_index(self, exclude_const=False, index=None):
         """
         from .schema import _build_job_statepoint_index
         if index is None:
-            index = [{'_id': job._id, 'statepoint': job.sp()} for job in self]
-        for x, y in _build_job_statepoint_index(
-                jobs=self, exclude_const=exclude_const, index=index):
-            yield tuple(x.split('.')), y
+            index = [{'_id': job._id, 'sp': job.sp()} for job in self]
+        for x in _build_job_statepoint_index(jobs=self, exclude_const=exclude_const, index=index):
+            yield tuple(x.split('.'))

     def detect_schema(self, exclude_const=False, subset=None, index=None):
         """Detect the project's state point schema.
@@ -612,14 +618,15 @@ def _find_job_ids(self, filter=None, doc_filter=None, index=None):
         if filter is None and doc_filter is None and index is None:
             return list(self._job_dirs())
         if index is None:
-            if doc_filter is None:
-                index = self._sp_index()
-            else:
+            filter = dict(parse_filter(filter, 'sp'))
+            if doc_filter:
+                filter.update(parse_filter(doc_filter, 'doc'))
                 index = self.index(include_job_document=True)
-            search_index = JobSearchIndex(index, _trust=True)
-        else:
-            search_index = JobSearchIndex(index)
-        return search_index.find_job_ids(filter=filter, doc_filter=doc_filter)
+            elif 'doc' in _root_keys(filter):
+                index = self.index(include_job_document=True)
+            else:
+                index = self._sp_index()
+        return Collection(index, _trust=True)._find(filter)

     def find_jobs(self, filter=None, doc_filter=None):
         """Find all jobs in the project's workspace.
@@ -641,7 +648,10 @@ def find_jobs(self, filter=None, doc_filter=None):
         :raises RuntimeError: If the filters are not supported
             by the index.
         """
-        return JobsCursor(self, filter, doc_filter)
+        filter = dict(parse_filter(filter, 'sp'))
+        if doc_filter:
+            filter.update(parse_filter(doc_filter, 'doc'))
+        return JobsCursor(self, filter)

     def __iter__(self):
         return iter(self.find_jobs())
@@ -658,6 +668,14 @@ def groupby(self, key=None, default=None):
             for key, group in project.groupby('a'):
                 print(key, list(group))

+            # Group jobs by document value 'a'.
+            for key, group in project.groupby('doc.a'):
+                print(key, list(group))
+
+            # Group jobs by job.sp['a'] and job.document['b'].
+            for key, group in project.groupby(('a', 'doc.b')):
+                print(key, list(group))
+
             # Find jobs where job.sp['a'] is 1 and group them
             # by job.sp['b'] and job.sp['c'].
             for key, group in project.find_jobs({'a': 1}).groupby(('b', 'c')):
@@ -1228,7 +1246,7 @@ def repair(self, fn_statepoints=None, index=None, job_ids=None):
                 raise
         if index is not None:
             for doc in index:
-                self._sp_cache[doc['signac_id']] = doc['statepoint']
+                self._sp_cache[doc['signac_id']] = doc['sp']

         corrupted = []
         for job_id in job_ids:
@@ -1286,7 +1304,7 @@ def _sp_index(self):
         for _id in to_remove:
             del self._index_cache[_id]
         for _id in to_add:
-            self._index_cache[_id] = dict(statepoint=self._get_statepoint(_id), _id=_id)
+            self._index_cache[_id] = dict(sp=self._get_statepoint(_id), _id=_id)
         return self._index_cache.values()

     def _build_index(self, include_job_document=False):
@@ -1295,14 +1313,14 @@ def _build_index(self, include_job_document=False):
         """
         wd = self.workspace() if self.Job is Job else None
         for _id in self._find_job_ids():
-            doc = dict(_id=_id, statepoint=self._get_statepoint(_id))
+            doc = dict(_id=_id, sp=self._get_statepoint(_id))
             if include_job_document:
                 if wd is None:
-                    doc.update(self.open_job(id=_id).document)
+                    doc['doc'] = self.open_job(id=_id).document
                 else:  # use optimized path
                     try:
                         with open(os.path.join(wd, _id, self.Job.FN_DOCUMENT), 'rb') as file:
-                            doc.update(json.loads(file.read().decode()))
+                            doc['doc'] = json.loads(file.read().decode())
                     except IOError as error:
                         if error.errno != errno.ENOENT:
                             raise
@@ -1695,25 +1713,23 @@ class JobsCursor(object):
     """
     _use_pandas_for_html_repr = True  # toggle use of pandas for html repr

-    def __init__(self, project, filter, doc_filter):
+    def __init__(self, project, filter):
         self._project = project
         self._filter = filter
-        self._doc_filter = doc_filter

         # This private attribute allows us to implement the deprecated
         # next() method for this class.
         self._next_iter = None

     def __eq__(self, other):
-        return self._project == other._project and self._filter == other._filter\
-            and self._doc_filter == other._doc_filter
+        return self._project == other._project and self._filter == other._filter

     def __len__(self):
         # Highly performance critical code path!!
-        if self._filter or self._doc_filter:
+        if self._filter:
             # We use the standard function for determining job ids if and only if
             # any of the two filter is provided.
-            return len(self._project._find_job_ids(self._filter, self._doc_filter))
+            return len(self._project._find_job_ids(self._filter))
         else:
             # Without filter we can simply return the length of the whole project.
             return self._project.__len__()
@@ -1722,7 +1738,7 @@ def __iter__(self):
         # Code duplication here for improved performance.
         return _JobsCursorIterator(
             self._project,
-            self._project._find_job_ids(self._filter, self._doc_filter),
+            self._project._find_job_ids(self._filter)
         )

     def next(self):
@@ -1753,6 +1769,14 @@ def groupby(self, key=None, default=None):
             for key, group in project.groupby('a'):
                 print(key, list(group))

+            # Group jobs by document value 'a'.
+            for key, group in project.groupby('doc.a'):
+                print(key, list(group))
+
+            # Group jobs by job.sp['a'] and job.document['b'].
+            for key, group in project.groupby(('a', 'doc.b')):
+                print(key, list(group))
+
             # Find jobs where job.sp['a'] is 1 and group them
             # by job.sp['b'] and job.sp['c'].
             for key, group in project.find_jobs({'a': 1}).groupby(('b', 'c')):
@@ -1784,23 +1808,45 @@ def groupby(self, key=None, default=None):
                 else:
                     _filter = {'$and': [{key: {"$exists": True}}, _filter]}

-                def keyfunction(job):
-                    return job.sp[key]
+                if '.' in key and key.split('.', 1)[0] == 'doc':
+                    def keyfunction(job):
+                        return job.document[key[4:]]
+                else:
+                    key = key[3:] if '.' in key and key.split('.', 1)[0] == 'sp' else key
+
+                    def keyfunction(job):
+                        return job.sp[key]
             else:
-                def keyfunction(job):
-                    return job.sp.get(key, default)
+                if '.' in key and key.split('.', 1)[0] == 'doc':
+                    def keyfunction(job):
+                        return job.document.get(key[4:], default)
+                else:
+                    key = key[3:] if '.' in key and key.split('.', 1)[0] == 'sp' else key
+
+                    def keyfunction(job):
+                        return job.sp.get(key, default)

         elif isinstance(key, Iterable):
+            sp_keys = []
+            doc_keys = []
+            for k in key:
+                if '.' in k and k.split('.', 1)[0] == 'doc':
+                    doc_keys.append(k[4:])
+                else:
+                    sp_keys.append(k[3:] if '.' in k and k.split('.', 1)[0] == 'sp' else k)
+
             if default is None:
                 if _filter is None:
                     _filter = {k: {"$exists": True} for k in key}
                 else:
                     _filter = {'$and': [{k: {"$exists": True} for k in key}, _filter]}

                 def keyfunction(job):
-                    return tuple(job.sp[k] for k in key)
+                    return tuple([job.sp[k] for k in sp_keys] + [job.document[k] for k in doc_keys])
             else:
                 def keyfunction(job):
-                    return tuple(job.sp.get(k, default) for k in key)
+                    return tuple([job.sp.get(k, default) for k in sp_keys] +
+                                 [job.document.get(k, default) for k in doc_keys])
         elif key is None:
             # Must return a type that can be ordered with <, >
             def keyfunction(job):
@@ -1809,7 +1855,7 @@ def keyfunction(job):
             # Pass the job document to a callable
             keyfunction = key

-        return groupby(sorted(iter(JobsCursor(self._project, _filter, self._doc_filter)),
+        return groupby(sorted(iter(JobsCursor(self._project, _filter)),
                               key=keyfunction), key=keyfunction)

     def groupbydoc(self, key=None, default=None):
@@ -1910,11 +1956,10 @@ def _export_sp_and_doc(job):
                 orient='index').infer_objects()

     def __repr__(self):
-        return '{type}(project={project}, filter={filter}, doc_filter={doc_filter})'.format(
+        return '{type}(project={project}, filter={filter})'.format(
             type=self.__class__.__name__,
             project=repr(self._project),
-            filter=repr(self._filter),
-            doc_filter=repr(self._doc_filter))
+            filter=repr(self._filter))

     def _repr_html_jobs(self):
         html = ''
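
Taken together, the project.py changes make state point and document queries symmetric. Below is a minimal usage sketch against this branch; the project layout and the keys 'a' and 'b' are invented for illustration:

# Illustrative sketch only -- assumes an existing project whose jobs carry a
# state point key 'a' and a document key 'b'.
import signac

project = signac.get_project()

# One integrated filter: unprefixed keys query the state point,
# 'doc.'-prefixed keys query the job document.
jobs = project.find_jobs({'a': 1, 'doc.b': 2})

# The two-argument form still works; find_jobs folds doc_filter into the
# same integrated filter under the 'doc.' prefix.
jobs = project.find_jobs({'a': 1}, doc_filter={'b': 2})

# groupby accepts 'doc.'-prefixed (and explicit 'sp.'-prefixed) keys.
for key, group in project.groupby('doc.b'):
    print(key, len(list(group)))

# Grouping by a state point key and a document key at once.
for key, group in project.find_jobs().groupby(('a', 'doc.b')):
    print(key, len(list(group)))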