From e9d7757f83d4af6bbd061bf2d93c13360d65b8d6 Mon Sep 17 00:00:00 2001
From: Markus Schmaus
Date: Fri, 2 Sep 2022 20:02:14 +0200
Subject: [PATCH 1/3] Add `start_sigma` to ADVI

---
 pymc/variational/approximations.py | 22 +++++++++++++++++++---
 pymc/variational/inference.py      | 13 +++++++++++--
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
index c0b9c7e3d6..9897232e75 100644
--- a/pymc/variational/approximations.py
+++ b/pymc/variational/approximations.py
@@ -67,17 +67,33 @@ def std(self):
     def __init_group__(self, group):
         super().__init_group__(group)
         if not self._check_user_params():
-            self.shared_params = self.create_shared_params(self._kwargs.get("start", None))
+            self.shared_params = self.create_shared_params(
+                self._kwargs.get("start", None), self._kwargs.get("start_sigma", None)
+            )
         self._finalize_init()
 
-    def create_shared_params(self, start=None):
+    def create_shared_params(self, start=None, start_sigma=None):
+        # NOTE: `Group._prepare_start` uses `self.model.free_RVs` to identify free variables and
+        # `DictToArrayBijection` to turn them into a flat array, while `Approximation.rslice` assumes that the free
+        # variables are given by `self.group` and that the mapping between original variables and flat array is given
+        # by `self.ordering`. In the cases I looked into these turn out to be the same, but there may be edge cases or
+        # future code changes that break this assumption.
         start = self._prepare_start(start)
-        rho = np.zeros((self.ddim,))
+        rho = self._prepare_start_sigma(start_sigma)
         return {
             "mu": aesara.shared(pm.floatX(start), "mu"),
             "rho": aesara.shared(pm.floatX(rho), "rho"),
         }
 
+    def _prepare_start_sigma(self, start_sigma):
+        rho = np.zeros((self.ddim,))
+        if start_sigma is not None:
+            for name, slice_, *_ in self.ordering.items():
+                sigma = start_sigma.get(name)
+                if sigma is not None:
+                    rho[slice_] = np.log(np.exp(np.abs(sigma)) - 1.0)
+        return rho
+
     @node_property
     def symbolic_random(self):
         initial = self.symbolic_initial
diff --git a/pymc/variational/inference.py b/pymc/variational/inference.py
index 88b5e7b744..4d8f0d49de 100644
--- a/pymc/variational/inference.py
+++ b/pymc/variational/inference.py
@@ -433,8 +433,10 @@ class ADVI(KLqp):
     random_seed: None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
+    start_sigma: `dict[str, np.ndarray]`
+        starting standard deviation for inference, only available for method 'advi'
 
     References
     ----------
@@ -660,6 +662,7 @@ def fit(
     model=None,
     random_seed=None,
     start=None,
+    start_sigma=None,
     inf_kwargs=None,
     **kwargs,
 ):
@@ -684,8 +687,10 @@ def fit(
         valid value to create instance specific one
     inf_kwargs: dict
         additional kwargs passed to :class:`Inference`
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
+    start_sigma: `dict[str, np.ndarray]`
+        starting standard deviation for inference, only available for method 'advi'
 
     Other Parameters
     ----------------
@@ -728,6 +733,10 @@ def fit(
         inf_kwargs["random_seed"] = random_seed
     if start is not None:
         inf_kwargs["start"] = start
+    if start_sigma is not None:
+        if method != "advi":
+            raise NotImplementedError("start_sigma is only available for method advi")
+        inf_kwargs["start_sigma"] = start_sigma
     if model is None:
         model = pm.modelcontext(model)
     _select = dict(advi=ADVI, fullrank_advi=FullRankADVI, svgd=SVGD, asvgd=ASVGD)
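The `rho` values written by `_prepare_start_sigma` are the inverse of the softplus parameterization used for `rho` in the mean-field approximation, where `sigma = log(1 + exp(rho))`, so a user-supplied standard deviation maps to `rho = log(exp(|sigma|) - 1)`. A minimal round-trip sketch of this transform, assuming only NumPy; the helper names are illustrative and not part of the patch:

    import numpy as np

    def sigma_to_rho(sigma):
        # Inverse softplus; patch 3 below rewrites this same formula with np.expm1.
        return np.log(np.exp(np.abs(sigma)) - 1.0)

    def rho_to_sigma(rho):
        # Softplus, as applied by the approximation; log1p is the stable form.
        return np.log1p(np.exp(rho))

    sigma = np.array([0.1, 1.0, 13.0])
    # Mapping sigma to rho and back recovers the requested standard deviation.
    assert np.allclose(rho_to_sigma(sigma_to_rho(sigma)), sigma)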
From 1bd4fda2e70975f82fa0d7319b0dc3f42e6826f5 Mon Sep 17 00:00:00 2001
From: Markus Schmaus
Date: Sat, 3 Sep 2022 13:07:15 +0200
Subject: [PATCH 2/3] add test for `start` and `start_sigma` plus minor fixes

---
 pymc/tests/test_variational_inference.py | 22 ++++++++++++++++++++++
 pymc/variational/approximations.py       |  2 +-
 pymc/variational/inference.py            | 16 ++++++++++------
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/pymc/tests/test_variational_inference.py b/pymc/tests/test_variational_inference.py
index c5b8a80cf4..7dbed2ab48 100644
--- a/pymc/tests/test_variational_inference.py
+++ b/pymc/tests/test_variational_inference.py
@@ -571,6 +571,28 @@ def test_fit_oo(inference, fit_kwargs, simple_model_data):
     np.testing.assert_allclose(np.std(trace.posterior["mu"]), np.sqrt(1.0 / d), rtol=0.2)
 
 
+def test_fit_start(inference_spec, simple_model):
+    mu_init = 17
+    mu_sigma_init = 13
+
+    with simple_model:
+        if type(inference_spec()) == ADVI:
+            has_start_sigma = True
+        else:
+            has_start_sigma = False
+
+    kw = {"start": {"mu": mu_init}}
+    if has_start_sigma:
+        kw.update({"start_sigma": {"mu": mu_sigma_init}})
+
+    with simple_model:
+        inference = inference_spec(**kw)
+    trace = inference.fit(n=0).sample(10000)
+    np.testing.assert_allclose(np.mean(trace.posterior["mu"]), mu_init, rtol=0.05)
+    if has_start_sigma:
+        np.testing.assert_allclose(np.std(trace.posterior["mu"]), mu_sigma_init, rtol=0.05)
+
+
 def test_profile(inference):
     inference.run_profiling(n=100).summary()
 
diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
index 9897232e75..b8c9135259 100644
--- a/pymc/variational/approximations.py
+++ b/pymc/variational/approximations.py
@@ -88,7 +88,7 @@ def create_shared_params(self, start=None, start_sigma=None):
     def _prepare_start_sigma(self, start_sigma):
         rho = np.zeros((self.ddim,))
         if start_sigma is not None:
-            for name, slice_, *_ in self.ordering.items():
+            for name, slice_, *_ in self.ordering.values():
                 sigma = start_sigma.get(name)
                 if sigma is not None:
                     rho[slice_] = np.log(np.exp(np.abs(sigma)) - 1.0)
diff --git a/pymc/variational/inference.py b/pymc/variational/inference.py
index 4d8f0d49de..e932df1e57 100644
--- a/pymc/variational/inference.py
+++ b/pymc/variational/inference.py
@@ -257,7 +257,9 @@ def _infmean(input_array):
                 )
             )
         else:
-            if n < 10:
+            if n == 0:
+                logger.info(f"Initialization only")
+            elif n < 10:
                 logger.info(f"Finished [100%]: Loss = {scores[-1]:,.5g}")
             else:
                 avg_loss = _infmean(scores[max(0, i - 1000) : i + 1])
@@ -466,7 +468,7 @@ class FullRankADVI(KLqp):
     random_seed: None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
-    start: `Point`
+    start: `dict[str, np.ndarray]` or `StartDict`
         starting point for inference
 
     References
@@ -534,13 +536,11 @@ class SVGD(ImplicitGradient):
         kernel function for KSD :math:`f(histogram) -> (k(x,.), \nabla_x k(x,.))`
     temperature: float
         parameter responsible for exploration, higher temperature gives more broad posterior estimate
-    start: `dict`
+    start: `dict[str, np.ndarray]` or `StartDict`
         initial point for inference
     random_seed: None or int
         leave None to use package global RandomStream or other
         valid value to create instance specific one
-    start: `Point`
-        starting point for inference
     kwargs: other keyword arguments passed to estimator
 
     References
@@ -631,7 +631,11 @@ def __init__(self, approx=None, estimator=KSD, kernel=test_functions.rbf, **kwar
             "is often **underestimated** when using temperature = 1."
         )
         if approx is None:
-            approx = FullRank(model=kwargs.pop("model", None))
+            approx = FullRank(
+                model=kwargs.pop("model", None),
+                random_seed=kwargs.pop("random_seed", None),
+                start=kwargs.pop("start", None),
+            )
         super().__init__(estimator=estimator, approx=approx, kernel=kernel, **kwargs)
 
     def fit(
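For reference, the user-facing API exercised by `test_fit_start` looks like this; a sketch assuming the patched PyMC, with a throwaway one-variable model standing in for the `simple_model` fixture, which is not shown in this diff:

    import numpy as np
    import pymc as pm

    with pm.Model():
        pm.Normal("mu", mu=0.0, sigma=10.0)
        # n=0 performs initialization only, so the sampled posterior
        # reflects the requested starting mean and standard deviation.
        approx = pm.fit(
            n=0,
            method="advi",
            start={"mu": 17.0},
            start_sigma={"mu": 13.0},  # rejected for any method other than "advi"
        )
    trace = approx.sample(10_000)
    np.testing.assert_allclose(np.mean(trace.posterior["mu"]), 17.0, rtol=0.05)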
From 9a894ce8575b6c63aea262a8f2d5266c5d782c97 Mon Sep 17 00:00:00 2001
From: Markus Schmaus
Date: Wed, 7 Sep 2022 01:05:14 +0200
Subject: [PATCH 3/3] inline _prepare_start_sigma and use expm1

---
 pymc/variational/approximations.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/pymc/variational/approximations.py b/pymc/variational/approximations.py
index b8c9135259..cace8f03b7 100644
--- a/pymc/variational/approximations.py
+++ b/pymc/variational/approximations.py
@@ -79,20 +79,19 @@ def create_shared_params(self, start=None, start_sigma=None):
         # by `self.ordering`. In the cases I looked into these turn out to be the same, but there may be edge cases or
         # future code changes that break this assumption.
         start = self._prepare_start(start)
-        rho = self._prepare_start_sigma(start_sigma)
-        return {
-            "mu": aesara.shared(pm.floatX(start), "mu"),
-            "rho": aesara.shared(pm.floatX(rho), "rho"),
-        }
+        rho1 = np.zeros((self.ddim,))
 
-    def _prepare_start_sigma(self, start_sigma):
-        rho = np.zeros((self.ddim,))
         if start_sigma is not None:
             for name, slice_, *_ in self.ordering.values():
                 sigma = start_sigma.get(name)
                 if sigma is not None:
-                    rho[slice_] = np.log(np.exp(np.abs(sigma)) - 1.0)
-        return rho
+                    rho1[slice_] = np.log(np.expm1(np.abs(sigma)))
+        rho = rho1
+
+        return {
+            "mu": aesara.shared(pm.floatX(start), "mu"),
+            "rho": aesara.shared(pm.floatX(rho), "rho"),
+        }
 
     @node_property
     def symbolic_random(self):
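The move from `np.exp(x) - 1.0` to `np.expm1(x)` in this last patch is a numerical-stability fix: for a very small `start_sigma`, `exp(sigma)` rounds to exactly 1.0 in float64 and the subtraction cancels to zero. A quick illustration, not part of the patch:

    import numpy as np

    s = 1e-17  # a tiny starting standard deviation
    naive = np.log(np.exp(s) - 1.0)  # exp(s) rounds to 1.0, so this is log(0) = -inf
    stable = np.log(np.expm1(s))     # approximately log(s) = -39.1, as expected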