From e9d7757f83d4af6bbd061bf2d93c13360d65b8d6 Mon Sep 17 00:00:00 2001
From: Markus Schmaus
Date: Fri, 2 Sep 2022 20:02:14 +0200
Subject: [PATCH 1/3] Add `start_sigma` to ADVI

---
 pymc/variational/approximations.py | 22 +++++++++++++++++++---
 pymc/variational/inference.py      | 13 +++++++++++--
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
index c0b9c7e3d6..9897232e75 100644
--- a/pymc/variational/approximations.py
+++ b/pymc/variational/approximations.py
@@ -67,17 +67,33 @@ def std(self):
     def __init_group__(self, group):
         super().__init_group__(group)
         if not self._check_user_params():
-            self.shared_params = self.create_shared_params(self._kwargs.get("start", None))
+            self.shared_params = self.create_shared_params(
+                self._kwargs.get("start", None), self._kwargs.get("start_sigma", None)
+            )
         self._finalize_init()
 
-    def create_shared_params(self, start=None):
+    def create_shared_params(self, start=None, start_sigma=None):
+        # NOTE: `Group._prepare_start` uses `self.model.free_RVs` to identify free variables and
+        # `DictToArrayBijection` to turn them into a flat array, while `Approximation.rslice` assumes that the free
+        # variables are given by `self.group` and that the mapping between original variables and flat array is given
+        # by `self.ordering`. In the cases I looked into these turn out to be the same, but there may be edge cases or
+        # future code changes that break this assumption.
         start = self._prepare_start(start)
-        rho = np.zeros((self.ddim,))
+        rho = self._prepare_start_sigma(start_sigma)
         return {
             "mu": aesara.shared(pm.floatX(start), "mu"),
             "rho": aesara.shared(pm.floatX(rho), "rho"),
         }
 
+    def _prepare_start_sigma(self, start_sigma):
+        rho = np.zeros((self.ddim,))
+        if start_sigma is not None:
+            for name, slice_, *_ in self.ordering.items():
+                sigma = start_sigma.get(name)
+                if sigma is not None:
+                    rho[slice_] = np.log(np.exp(np.abs(sigma)) - 1.0)
+        return rho
+
     @node_property
     def symbolic_random(self):
         initial = self.symbolic_initial
diff --git a/pymc/variational/inference.py b/pymc/variational/inference.py
index 88b5e7b744..4d8f0d49de 100644
--- a/pymc/variational/inference.py
+++ b/pymc/variational/inference.py
@@ -433,8 +433,10 @@ class ADVI(KLqp):
     random_seed: None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
+    start_sigma: `dict[str, np.ndarray]`
+        starting standard deviation for inference, only available for method 'advi'
 
     References
     ----------
@@ -660,6 +662,7 @@ def fit(
     model=None,
     random_seed=None,
     start=None,
+    start_sigma=None,
     inf_kwargs=None,
     **kwargs,
 ):
@@ -684,8 +687,10 @@ def fit(
         valid value to create instance specific one
     inf_kwargs: dict
         additional kwargs passed to :class:`Inference`
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
+    start_sigma: `dict[str, np.ndarray]`
+        starting standard deviation for inference, only available for method 'advi'
 
     Other Parameters
     ----------------
@@ -728,6 +733,10 @@ def fit(
         inf_kwargs["random_seed"] = random_seed
     if start is not None:
         inf_kwargs["start"] = start
+    if start_sigma is not None:
+        if method != "advi":
+            raise NotImplementedError("start_sigma is only available for method advi")
+        inf_kwargs["start_sigma"] = start_sigma
     if model is None:
         model = pm.modelcontext(model)
     _select = dict(advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD)
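The `rho` values written by `_prepare_start_sigma` are the inverse of the softplus parameterization used for `rho` in the mean-field approximation, where `sigma = log(1 + exp(rho))`, so a user-supplied standard deviation maps to `rho = log(exp(|sigma|) - 1)`. A minimal round-trip sketch of this transform, assuming only NumPy; the helper names are illustrative and not part of the patch:

    import numpy as np

    def sigma_to_rho(sigma):
        # Inverse softplus; patch 3 below rewrites this same formula with np.expm1.
        return np.log(np.exp(np.abs(sigma)) - 1.0)

    def rho_to_sigma(rho):
        # Softplus, as applied by the approximation; log1p is the stable form.
        return np.log1p(np.exp(rho))

    sigma = np.array([0.1, 1.0, 13.0])
    # Mapping sigma to rho and back recovers the requested standard deviation.
    assert np.allclose(rho_to_sigma(sigma_to_rho(sigma)), sigma)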
From 1bd4fda2e70975f82fa0d7319b0dc3f42e6826f5 Mon Sep 17 00:00:00 2001
From: Markus Schmaus
Date: Sat, 3 Sep 2022 13:07:15 +0200
Subject: [PATCH 2/3] add test for `start` and `start_sigma` plus minor fixes

---
 pymc/tests/test_variational_inference.py | 22 ++++++++++++++++++++++
 pymc/variational/approximations.py       |  2 +-
 pymc/variational/inference.py            | 16 ++++++++++------
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/pymc/tests/test_variational_inference.py b/pymc/tests/test_variational_inference.py
index c5b8a80cf4..7dbed2ab48 100644
--- a/pymc/tests/test_variational_inference.py
+++ b/pymc/tests/test_variational_inference.py
@@ -571,6 +571,28 @@ def test_fit_oo(inference, fit_kwargs, simple_model_data):
     np.testing.assert_allclose(np.std(trace.posterior["mu"]), np.sqrt(1.0 / d), rtol=0.2)
 
 
+def test_fit_start(inference_spec, simple_model):
+    mu_init = 17
+    mu_sigma_init = 13
+
+    with simple_model:
+        if type(inference_spec()) == ADVI:
+            has_start_sigma = True
+        else:
+            has_start_sigma = False
+
+    kw = {"start": {"mu": mu_init}}
+    if has_start_sigma:
+        kw.update({"start_sigma": {"mu": mu_sigma_init}})
+
+    with simple_model:
+        inference = inference_spec(**kw)
+    trace = inference.fit(n=0).sample(10000)
+    np.testing.assert_allclose(np.mean(trace.posterior["mu"]), mu_init, rtol=0.05)
+    if has_start_sigma:
+        np.testing.assert_allclose(np.std(trace.posterior["mu"]), mu_sigma_init, rtol=0.05)
+
+
 def test_profile(inference):
     inference.run_profiling(n=100).summary()
 
diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
index 9897232e75..b8c9135259 100644
--- a/pymc/variational/approximations.py
+++ b/pymc/variational/approximations.py
@@ -88,7 +88,7 @@ def create_shared_params(self, start=None, start_sigma=None):
     def _prepare_start_sigma(self, start_sigma):
         rho = np.zeros((self.ddim,))
         if start_sigma is not None:
-            for name, slice_, *_ in self.ordering.items():
+            for name, slice_, *_ in self.ordering.values():
                 sigma = start_sigma.get(name)
                 if sigma is not None:
                     rho[slice_] = np.log(np.exp(np.abs(sigma)) - 1.0)
diff --git a/pymc/variational/inference.py b/pymc/variational/inference.py
index 4d8f0d49de..e932df1e57 100644
--- a/pymc/variational/inference.py
+++ b/pymc/variational/inference.py
@@ -257,7 +257,9 @@ def _infmean(input_array):
                 )
             )
         else:
-            if n < 10:
+            if n == 0:
+                logger.info(f"Initialization only")
+            elif n < 10:
                 logger.info(f"Finished [100%]: Loss = {scores[-1]:,.5g}")
             else:
                 avg_loss = _infmean(scores[max(0, i - 1000) : i + 1])
@@ -466,7 +468,7 @@ class FullRankADVI(KLqp):
     random_seed: None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
 
     References
@@ -534,13 +536,11 @@ class SVGD(ImplicitGradient):
         kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))`
     temperature: float
         parameter responsible for exploration, higher temperature gives more broad posterior estimate
-    start: `dict`
+    start: `dict[str, np.ndarray]` or `StartDict`
         initial point for inference
     random_seed: None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
-    start: `Point`
-        starting point for inference
     kwargs: other keyword arguments passed to estimator
 
     References
@@ -631,7 +631,11 @@ def __init__(self, approx=None, estimator=KSD, kernel=test_functions.rbf, **kwar
             "is often **underestimated** when using temperature = 1."
         )
         if approx is None:
-            approx = FullRank(model=kwargs.pop("model", None))
+            approx = FullRank(
+                model=kwargs.pop("model", None),
+                random_seed=kwargs.pop("random_seed", None),
+                start=kwargs.pop("start", None),
+            )
         super().__init__(estimator=estimator, approx=approx, kernel=kernel, **kwargs)
 
     def fit(
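For reference, the user-facing API exercised by `test_fit_start` looks like this; a sketch assuming the patched PyMC, with a throwaway one-variable model standing in for the `simple_model` fixture, which is not shown in this diff:

    import numpy as np
    import pymc as pm

    with pm.Model():
        pm.Normal("mu", mu=0.0, sigma=10.0)
        # n=0 performs initialization only, so the sampled posterior
        # reflects the requested starting mean and standard deviation.
        approx = pm.fit(
            n=0,
            method="advi",
            start={"mu": 17.0},
            start_sigma={"mu": 13.0},  # rejected for any method other than "advi"
        )
    trace = approx.sample(10_000)
    np.testing.assert_allclose(np.mean(trace.posterior["mu"]), 17.0, rtol=0.05)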
From 9a894ce8575b6c63aea262a8f2d5266c5d782c97 Mon Sep 17 00:00:00 2001
From: Markus Schmaus
Date: Wed, 7 Sep 2022 01:05:14 +0200
Subject: [PATCH 3/3] inline _prepare_start_sigma and use expm1

---
 pymc/variational/approximations.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
index b8c9135259..cace8f03b7 100644
--- a/pymc/variational/approximations.py
+++ b/pymc/variational/approximations.py
@@ -79,20 +79,19 @@ def create_shared_params(self, start=None, start_sigma=None):
         # by `self.ordering`. In the cases I looked into these turn out to be the same, but there may be edge cases or
         # future code changes that break this assumption.
         start = self._prepare_start(start)
-        rho = self._prepare_start_sigma(start_sigma)
-        return {
-            "mu": aesara.shared(pm.floatX(start), "mu"),
-            "rho": aesara.shared(pm.floatX(rho), "rho"),
-        }
+        rho1 = np.zeros((self.ddim,))
 
-    def _prepare_start_sigma(self, start_sigma):
-        rho = np.zeros((self.ddim,))
         if start_sigma is not None:
             for name, slice_, *_ in self.ordering.values():
                 sigma = start_sigma.get(name)
                 if sigma is not None:
-                    rho[slice_] = np.log(np.exp(np.abs(sigma)) - 1.0)
-        return rho
+                    rho1[slice_] = np.log(np.expm1(np.abs(sigma)))
+        rho = rho1
+
+        return {
+            "mu": aesara.shared(pm.floatX(start), "mu"),
+            "rho": aesara.shared(pm.floatX(rho), "rho"),
+        }
 
     @node_property
     def symbolic_random(self):
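The move from `np.exp(x) - 1.0` to `np.expm1(x)` in this last patch is a numerical-stability fix: for a very small `start_sigma`, `exp(sigma)` rounds to exactly 1.0 in float64 and the subtraction cancels to zero. A quick illustration, not part of the patch:

    import numpy as np

    s = 1e-17  # a tiny starting standard deviation
    naive = np.log(np.exp(s) - 1.0)  # exp(s) rounds to 1.0, so this is log(0) = -inf
    stable = np.log(np.expm1(s))     # approximately log(s) = -39.1, as expected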