2323from typing_extensions import Final , TypeAlias as _TypeAlias
2424
2525from mypy .fscache import FileSystemCache
26+ from mypy .nodes import MypyFile
2627from mypy .options import Options
2728from mypy .stubinfo import is_legacy_bundled_package
2829from mypy import pyinfo
@@ -126,6 +127,33 @@ def __repr__(self) -> str:
126127 self .base_dir )
127128
128129
class BuildSourceSet:
    """Index over the build sources for quick "is this file a source?" checks.

    The constructor walks the given sources once and records their paths and
    module names, so that later membership queries are plain set/dict lookups.
    """

    def __init__(self, sources: List[BuildSource]) -> None:
        """Record the path, module name and inline-text status of each source."""
        # Becomes True as soon as one source carries its text inline.
        self.source_text_present = False
        # Module name -> path of that source ('' when the source has no path).
        self.source_modules = {}  # type: Dict[str, str]
        # Every non-empty path found among the sources.
        self.source_paths = set()  # type: Set[str]

        for src in sources:
            path = src.path
            module = src.module
            if src.text is not None:
                self.source_text_present = True
            if path:
                self.source_paths.add(path)
            if module:
                self.source_modules[module] = path or ''

    def is_source(self, file: MypyFile) -> bool:
        """Return True if `file` counts as one of the build sources.

        A file matches when its path is a recorded source path or its full
        module name is a recorded source module. When any source was given as
        inline text, every file is reported as a source.
        """
        path = file.path
        if path and path in self.source_paths:
            return True
        if file._fullname in self.source_modules:
            return True
        return self.source_text_present
155+
156+
129157class FindModuleCache :
130158 """Module finder with integrated cache.
131159
@@ -141,8 +169,10 @@ def __init__(self,
141169 search_paths : SearchPaths ,
142170 fscache : Optional [FileSystemCache ],
143171 options : Optional [Options ],
144- stdlib_py_versions : Optional [StdlibVersions ] = None ) -> None :
172+ stdlib_py_versions : Optional [StdlibVersions ] = None ,
173+ source_set : Optional [BuildSourceSet ] = None ) -> None :
145174 self .search_paths = search_paths
175+ self .source_set = source_set
146176 self .fscache = fscache or FileSystemCache ()
147177 # Cache for get_toplevel_possibilities:
148178 # search_paths -> (toplevel_id -> list(package_dirs))
@@ -164,6 +194,53 @@ def clear(self) -> None:
164194 self .initial_components .clear ()
165195 self .ns_ancestors .clear ()
166196
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
    """Try to resolve module `id` directly from the build source set.

    This is only used when --fast-module-lookup is passed on the command line.

    Returns the recorded path when `id` is a source module whose file exists
    and whose enclosing directories still contain an __init__ file. Returns
    ModuleNotFoundReason.NOT_FOUND when the parent of `id` resolves to a plain
    module file (not a package), so `id` cannot be a submodule of it. Returns
    None whenever the source set cannot give an answer.
    """
    source_set = self.source_set
    if not source_set:
        return None

    path = source_set.source_modules.get(id)
    if path and self.fscache.isfile(path):
        # Walk up one directory per dot in the module name and require an
        # __init__ file at each level. Without this check we could report
        # false positives compared to the slow path when init files have
        # been deleted, which is covered by some tests.
        # TODO: is there some combination of flags under which this check
        #       should be skipped?
        directory = os.path.dirname(path)
        for _ in range(id.count('.')):
            has_init = any(
                self.fscache.isfile(os.path.join(directory, '__init__' + ext))
                for ext in PYTHON_EXTENSIONS
            )
            if not has_init:
                return None
            directory = os.path.dirname(directory)
        return path

    parent_id, dot, _ = id.rpartition('.')
    if not dot:
        # Top-level name that is not in the source set: nothing more to try.
        return None

    # Looking for foo.bar.baz without a matching source module: see whether
    # foo.bar itself resolves through the source set.
    parent = self.find_module_via_source_set(parent_id)
    if not isinstance(parent, str):
        return None

    init_names = tuple('__init__' + ext for ext in PYTHON_EXTENSIONS)
    if parent.endswith(init_names):
        # The parent is a package, so 'baz' may be a real submodule somewhere.
        return None

    stem, suffix = os.path.splitext(parent)
    if suffix in PYTHON_EXTENSIONS and not self.fscache.isdir(stem):
        # The parent is a plain *module* (not an __init__ file, and there is
        # no directory of the same name next to it), so 'baz' is either a
        # top-level name inside foo.bar or does not exist at all.
        #
        # Either way, searching the other search paths for another
        # 'foo.bar.baz' module should be avoided because:
        #  1. in the unlikely event that one were found, it would most likely
        #     be unrelated to the source being typechecked and therefore more
        #     likely to lead to erroneous results
        #  2. as described in _find_module, in some cases the search itself
        #     could waste significant amounts of time
        return ModuleNotFoundReason.NOT_FOUND
    return None
243+
167244 def find_lib_path_dirs (self , id : str , lib_path : Tuple [str , ...]) -> PackageDirs :
168245 """Find which elements of a lib_path have the directory a module needs to exist.
169246
@@ -229,7 +306,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
229306 elif top_level in self .stdlib_py_versions :
230307 use_typeshed = self ._typeshed_has_version (top_level )
231308 self .results [id ] = self ._find_module (id , use_typeshed )
232- if (not fast_path
309+ if (not ( fast_path or ( self . options is not None and self . options . fast_module_lookup ))
233310 and self .results [id ] is ModuleNotFoundReason .NOT_FOUND
234311 and self ._can_find_module_in_parent_dir (id )):
235312 self .results [id ] = ModuleNotFoundReason .WRONG_WORKING_DIRECTORY
@@ -295,6 +372,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
295372 def _find_module (self , id : str , use_typeshed : bool ) -> ModuleSearchResult :
296373 fscache = self .fscache
297374
375+ # Fast path for any modules in the current source set.
376+ # This is particularly important when there are a large number of search
377+ # paths which share the first (few) component(s) due to the use of namespace
378+ # packages, for instance:
379+ # foo/
380+ # company/
381+ # __init__.py
382+ # foo/
383+ # bar/
384+ # company/
385+ # __init__.py
386+ # bar/
387+ # baz/
388+ # company/
389+ # __init__.py
390+ # baz/
391+ #
392+ # mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
393+ # and computes [foo, bar, baz, ...] as the module search path.
394+ #
395+ # This would result in O(n) search for every import of company.*, leading to
396+ # O(n**2) behavior in load_graph as such imports are unsurprisingly present
397+ # at least once, and usually many more times than that, in each and every file
398+ # being parsed.
399+ #
400+ # Thankfully, such cases are efficiently handled by looking up the module path
401+ # via BuildSourceSet.
402+ p = (self .find_module_via_source_set (id )
403+ if (self .options is not None and self .options .fast_module_lookup )
404+ else None )
405+ if p :
406+ return p
407+
298408 # If we're looking for a module like 'foo.bar.baz', it's likely that most of the
299409 # many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
300410 # that only once and cache it for when we look for modules like 'foo.bar.blah'
0 commit comments