From 48841de77bb545c8dc1f4b4dd40a1267275f5790 Mon Sep 17 00:00:00 2001
From: Sait Cakmak
Date: Tue, 14 Oct 2025 15:27:02 -0700
Subject: [PATCH] Clean up some leftover utils from legacy generators

Summary:
Removes a number of model fitting & acqf construction utils that were used by
the legacy generators. These are no longer used anywhere, since the legacy
generators have been removed.

Also renames `botorch_moo_defaults` to `botorch_moo_utils` to better
communicate its purpose.

Reviewed By: dme65

Differential Revision: D84653561
---
 ax/adapter/adapter_utils.py                   |   2 +-
 ax/adapter/tests/test_torch_moo_adapter.py    |   2 +-
 ax/adapter/torch.py                           |   2 +-
 ax/generators/tests/test_botorch_defaults.py  | 564 -------------
 ..._defaults.py => test_botorch_moo_utils.py} |  97 +--
 ax/generators/tests/test_torch_model_utils.py |  29 +-
 ax/generators/torch/botorch_defaults.py       | 746 -----------------
 .../torch/botorch_modular/acquisition.py      |   2 +-
 ax/generators/torch/botorch_moo_defaults.py   | 775 ------------------
 ax/generators/torch/botorch_moo_utils.py      | 306 +++++++
 ax/generators/torch/tests/test_acquisition.py |   2 +-
 ax/generators/torch/utils.py                  |  74 +-
 12 files changed, 321 insertions(+), 2280 deletions(-)
 delete mode 100644 ax/generators/tests/test_botorch_defaults.py
 rename ax/generators/tests/{test_botorch_moo_defaults.py => test_botorch_moo_utils.py} (78%)
 delete mode 100644 ax/generators/torch/botorch_defaults.py
 delete mode 100644 ax/generators/torch/botorch_moo_defaults.py
 create mode 100644 ax/generators/torch/botorch_moo_utils.py

diff --git a/ax/adapter/adapter_utils.py b/ax/adapter/adapter_utils.py
index 276ab8c9fa3..258737d06d2 100644
--- a/ax/adapter/adapter_utils.py
+++ b/ax/adapter/adapter_utils.py
@@ -48,7 +48,7 @@
 )
 from ax.core.types import TBounds, TCandidateMetadata
 from ax.exceptions.core import DataRequiredError, UserInputError
-from ax.generators.torch.botorch_moo_defaults import (
+from ax.generators.torch.botorch_moo_utils import (
     get_weighted_mc_objective_and_objective_thresholds,
     pareto_frontier_evaluator,
 )
diff --git a/ax/adapter/tests/test_torch_moo_adapter.py b/ax/adapter/tests/test_torch_moo_adapter.py
index 34d92ec7050..0cb72a03766 100644
--- a/ax/adapter/tests/test_torch_moo_adapter.py
+++ b/ax/adapter/tests/test_torch_moo_adapter.py
@@ -32,7 +32,7 @@
 )
 from ax.core.parameter_constraint import ParameterConstraint
 from ax.generators.torch.botorch_modular.generator import BoTorchGenerator
-from ax.generators.torch.botorch_moo_defaults import (
+from ax.generators.torch.botorch_moo_utils import (
     infer_objective_thresholds,
     pareto_frontier_evaluator,
 )
diff --git a/ax/adapter/torch.py b/ax/adapter/torch.py
index 86731a99bdf..3e280fc6398 100644
--- a/ax/adapter/torch.py
+++ b/ax/adapter/torch.py
@@ -64,7 +64,7 @@
 from ax.exceptions.core import DataRequiredError, UnsupportedError, UserInputError
 from ax.exceptions.generation_strategy import OptimizationConfigRequired
 from ax.generators.torch.botorch_modular.generator import BoTorchGenerator
-from ax.generators.torch.botorch_moo_defaults import infer_objective_thresholds
+from ax.generators.torch.botorch_moo_utils import infer_objective_thresholds
 from ax.generators.torch.utils import _get_X_pending_and_observed
 from ax.generators.torch_base import TorchGenerator, TorchOptConfig
 from ax.generators.types import TConfig
diff --git a/ax/generators/tests/test_botorch_defaults.py b/ax/generators/tests/test_botorch_defaults.py
deleted file mode 100644
index 7c0ae294256..00000000000
--- a/ax/generators/tests/test_botorch_defaults.py
+++ /dev/null
@@
-1,564 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -import math -from copy import deepcopy -from unittest import mock -from unittest.mock import Mock - -import torch -from ax.generators.torch.botorch_defaults import ( - _get_acquisition_func, - _get_customized_covar_module, - _get_model, - get_and_fit_model, - get_warping_transform, - NO_OBSERVED_POINTS_MESSAGE, -) -from ax.utils.common.testutils import TestCase -from ax.utils.testing.mock import mock_botorch_optimize -from botorch.acquisition.logei import ( - qLogExpectedImprovement, - qLogNoisyExpectedImprovement, -) -from botorch.acquisition.monte_carlo import ( - qExpectedImprovement, - qNoisyExpectedImprovement, - qProbabilityOfImprovement, - qSimpleRegret, -) -from botorch.acquisition.objective import ConstrainedMCObjective -from botorch.acquisition.penalized import L1PenaltyObjective, PenalizedMCObjective -from botorch.exceptions.errors import UnsupportedError -from botorch.models.gp_regression import SingleTaskGP -from botorch.models.gp_regression_fidelity import SingleTaskMultiFidelityGP -from botorch.models.multitask import MultiTaskGP -from botorch.models.transforms.input import Warp -from botorch.utils.constraints import get_outcome_constraint_transforms -from gpytorch.kernels import MaternKernel, ScaleKernel -from gpytorch.likelihoods.gaussian_likelihood import ( - FixedNoiseGaussianLikelihood, - GaussianLikelihood, -) -from gpytorch.likelihoods.hadamard_gaussian_likelihood import HadamardGaussianLikelihood -from gpytorch.module import Module -from gpytorch.priors import GammaPrior -from gpytorch.priors.lkj_prior import LKJCovariancePrior -from gpytorch.priors.prior import Prior -from pyre_extensions import assert_is_instance, none_throws - - -class BotorchDefaultsTest(TestCase): - def test_get_model(self) -> None: - x = torch.rand(2, 2) - y = torch.rand(2, 1) - var = torch.rand(2, 1) - partial_var = torch.tensor([0, float("nan")]).unsqueeze(-1) - unknown_var = torch.tensor([float("nan"), float("nan")]).unsqueeze(-1) - model = _get_model(x, y, unknown_var, None) - self.assertIsInstance(model, SingleTaskGP) - self.assertIsInstance(model.likelihood, GaussianLikelihood) - - model = _get_model(X=x, Y=y, Yvar=var) - self.assertIsInstance(model, SingleTaskGP) - self.assertIsInstance(model.likelihood, FixedNoiseGaussianLikelihood) - self.assertEqual( - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `lengthscale_prior`. - model.covar_module.lengthscale_prior.loc, - math.log(2.0) / 2 + 2**0.5, - ) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `lengthscale_prior`. - self.assertEqual(model.covar_module.lengthscale_prior.scale, 3**0.5) - model = _get_model(X=x, Y=y, Yvar=unknown_var, task_feature=1) - self.assertIs(type(model), MultiTaskGP) # Don't accept subclasses. 
- self.assertIsInstance(model.likelihood, HadamardGaussianLikelihood) - model = _get_model(X=x, Y=y, Yvar=var, task_feature=1) - self.assertIsInstance(model, MultiTaskGP) - self.assertIsInstance(model.likelihood, FixedNoiseGaussianLikelihood) - model = _get_model(X=x, Y=y, Yvar=partial_var.clone(), task_feature=1) - self.assertIsInstance(model, MultiTaskGP) - model = _get_model(X=x, Y=y, Yvar=partial_var.clone(), task_feature=1, rank=1) - self.assertEqual(model._rank, 1) - with self.assertRaises(ValueError): - model = _get_model(X=x, Y=y, Yvar=partial_var, task_feature=None) - model = _get_model(X=x, Y=y, Yvar=var, fidelity_features=[-1]) - self.assertTrue(isinstance(model, SingleTaskMultiFidelityGP)) - with self.assertRaises(NotImplementedError): - _get_model(X=x, Y=y, Yvar=var, task_feature=1, fidelity_features=[-1]) - # test fixed prior - prior = { - "type": LKJCovariancePrior, - "sd_prior": GammaPrior(2.0, 0.44), - "eta": 0.6, - } - x[0, 1] = 0 - x[1, 1] = 1 - model = _get_model( - X=x, Y=y, Yvar=partial_var.clone(), task_feature=1, prior=prior - ) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `kernels`. - task_covar_module = model.covar_module.kernels[1] - self.assertIsInstance( - task_covar_module.IndexKernelPrior, - LKJCovariancePrior, - ) - self.assertEqual( - task_covar_module.IndexKernelPrior.sd_prior.concentration, - 2.0, - ) - self.assertEqual(task_covar_module.IndexKernelPrior.sd_prior.rate, 0.44) - self.assertEqual( - task_covar_module.IndexKernelPrior.correlation_prior.eta, - 0.6, - ) - model = _get_model( - X=x, - Y=y, - Yvar=partial_var.clone(), - task_feature=1, - prior={"type": LKJCovariancePrior}, - ) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `kernels`. - task_covar_module = model.covar_module.kernels[1] - self.assertIsInstance( - task_covar_module.IndexKernelPrior, - LKJCovariancePrior, - ) - self.assertEqual( - task_covar_module.IndexKernelPrior.sd_prior.concentration, - 1.0, - ) - self.assertEqual(task_covar_module.IndexKernelPrior.sd_prior.rate, 0.15) - self.assertEqual( - task_covar_module.IndexKernelPrior.correlation_prior.eta, - 0.5, - ) - prior = { - "type": LKJCovariancePrior, - "sd_prior": GammaPrior(2.0, 0.44), - "eta": "hi", - } - with self.assertRaises(ValueError): - _get_model(X=x, Y=y, Yvar=partial_var.clone(), task_feature=1, prior=prior) - - prior = {"type": Prior, "sd_prior": GammaPrior(2.0, 0.44), "eta": 0.5} - with self.assertRaises(NotImplementedError): - _get_model(X=x, Y=y, Yvar=partial_var.clone(), task_feature=1, prior=prior) - # test passing customized prior - prior = { - "covar_module_prior": {"lengthscale_prior": GammaPrior(12.0, 2.0)}, - "type": LKJCovariancePrior, - } - model = _get_model(X=x, Y=y, Yvar=var, prior=deepcopy(prior)) - self.assertIsInstance(model, SingleTaskGP) - self.assertEqual( - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `base_kernel`. - model.covar_module.base_kernel.lengthscale_prior.concentration, - 12.0, - ) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `base_kernel`. - self.assertEqual(model.covar_module.base_kernel.lengthscale_prior.rate, 2.0) - model = _get_model( - X=x, - Y=y, - Yvar=unknown_var, - task_feature=1, - prior=deepcopy(prior), - ) - self.assertIs(type(model), MultiTaskGP) - self.assertIsInstance(model.likelihood, HadamardGaussianLikelihood) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `kernels`. 
- data_covar_module, task_covar_module = model.covar_module.kernels - self.assertEqual( - data_covar_module.base_kernel.lengthscale_prior.concentration, - 12.0, - ) - self.assertEqual(data_covar_module.base_kernel.lengthscale_prior.rate, 2.0) - self.assertIsInstance( - task_covar_module.IndexKernelPrior, - LKJCovariancePrior, - ) - model = _get_model( - X=x, - Y=y, - Yvar=var, - task_feature=1, - prior=deepcopy(prior), - ) - self.assertIsInstance(model, MultiTaskGP) - self.assertIsInstance(model.likelihood, FixedNoiseGaussianLikelihood) - self.assertEqual( - data_covar_module.base_kernel.lengthscale_prior.concentration, - 12.0, - ) - self.assertEqual(data_covar_module.base_kernel.lengthscale_prior.rate, 2.0) - self.assertIsInstance( - task_covar_module.IndexKernelPrior, - LKJCovariancePrior, - ) - # test passing customized prior - prior = { - "covar_module_prior": {"lengthscale_prior": GammaPrior(12.0, 2.0)}, - } - covar_module = MaternKernel( - nu=2.5, - ard_num_dims=2, - lengthscale_prior=GammaPrior(6.0, 6.0), - ) - model = _get_model( - X=x, - Y=y, - Yvar=var, - covar_module=covar_module, - prior=prior, - ) - self.assertIsInstance(model, SingleTaskGP) - self.assertIsInstance(model.likelihood, FixedNoiseGaussianLikelihood) - self.assertEqual(covar_module, model.covar_module) - - # test input warping dimension checks. - with self.assertRaisesRegex(UnsupportedError, "batched multi output models"): - _get_model( - X=torch.ones(4, 3, 2), - Y=torch.ones(4, 3, 2), - Yvar=torch.zeros(4, 3, 2), - use_input_warping=True, - ) - - @mock.patch("ax.generators.torch.botorch_defaults._get_model", wraps=_get_model) - @mock_botorch_optimize - def test_task_feature(self, get_model_mock: Mock) -> None: - x = [torch.zeros(2, 2)] - y = [torch.zeros(2, 1)] - yvars = [torch.ones(2, 1)] - get_and_fit_model( - Xs=x, - Ys=y, - Yvars=yvars, - task_features=[1], - fidelity_features=[], - metric_signatures=["L2NormMetric"], - state_dict=None, - refit_model=False, - ) - # Check that task feature was correctly passed to _get_model - self.assertEqual(get_model_mock.mock_calls[0][2]["task_feature"], 1) - - # check error on multiple task features - with self.assertRaises(NotImplementedError): - get_and_fit_model( - Xs=x, - Ys=y, - Yvars=yvars, - task_features=[0, 1], - fidelity_features=[], - metric_signatures=["L2NormMetric"], - state_dict=None, - refit_model=False, - ) - - # check error on multiple fidelity features - with self.assertRaises(NotImplementedError): - get_and_fit_model( - Xs=x, - Ys=y, - Yvars=yvars, - task_features=[], - fidelity_features=[-1, -2], - metric_signatures=["L2NormMetric"], - state_dict=None, - refit_model=False, - ) - - # check error on botch task and fidelity feature - with self.assertRaises(NotImplementedError): - get_and_fit_model( - Xs=x, - Ys=y, - Yvars=yvars, - task_features=[1], - fidelity_features=[-1], - metric_signatures=["L2NormMetric"], - state_dict=None, - refit_model=False, - ) - - @mock_botorch_optimize - def test_pass_customized_prior(self) -> None: - x = [torch.zeros(2, 2)] - y = [torch.zeros(2, 1)] - yvars = [torch.ones(2, 1)] - prior = { - "covar_module_prior": { - "lengthscale_prior": GammaPrior(12.0, 2.0), - "outputscale_prior": GammaPrior(2.0, 12.0), - }, - } - model = get_and_fit_model( - Xs=x, - Ys=y, - Yvars=yvars, - task_features=[], - fidelity_features=[], - metric_signatures=["L2NormMetric"], - state_dict=None, - refit_model=False, - prior=prior, - ) - self.assertIsInstance(model, SingleTaskGP) - self.assertIsInstance(model.likelihood, FixedNoiseGaussianLikelihood) 
- - self.assertEqual( - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `base_kernel`. - model.covar_module.base_kernel.lengthscale_prior.concentration, - 12.0, - ) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `base_kernel`. - self.assertEqual(model.covar_module.base_kernel.lengthscale_prior.rate, 2.0) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `outputscale_prior`. - self.assertEqual(model.covar_module.outputscale_prior.concentration, 2.0) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `outputscale_prior`. - self.assertEqual(model.covar_module.outputscale_prior.rate, 12.0) - - model = get_and_fit_model( - Xs=x + x, - Ys=y + y, - Yvars=yvars + yvars, - task_features=[1], - fidelity_features=[], - metric_signatures=["L2NormMetric", "L2NormMetric2"], - state_dict=None, - refit_model=False, - prior=prior, - ) - # pyre-fixme[29]: `Union[(self: Tensor) -> Any, Tensor, Module]` is not a - # function. - for m in model.models: - self.assertIs(type(m), MultiTaskGP) - data_covar_module, task_covar_module = m.covar_module.kernels - self.assertIsInstance(m.likelihood, FixedNoiseGaussianLikelihood) - self.assertEqual( - data_covar_module.base_kernel.lengthscale_prior.concentration, - 12.0, - ) - self.assertEqual(data_covar_module.base_kernel.lengthscale_prior.rate, 2.0) - self.assertEqual(data_covar_module.outputscale_prior.concentration, 2.0) - self.assertEqual(data_covar_module.outputscale_prior.rate, 12.0) - - def test_get_acquisition_func(self) -> None: - d, m = 3, 2 - n = 16 - x = torch.randn(n, d) - y = torch.randn(n, m) - unknown_var = torch.tensor([float("nan"), float("nan")]).unsqueeze(-1) - model = _get_model(x, y, unknown_var, None) - objective_weights = torch.tensor([1.0, 0.0]) # first output is objective - outcome_constraints = ( - torch.tensor([[0.0, 1.0], [0.0, -1.0], [1.0, 1.0]]), # k x m - torch.tensor([[1.0], [-1.0], [0.0]]), # k x 1 - ) - X_observed = torch.zeros(2, d) - expected_constraints = none_throws( - get_outcome_constraint_transforms(outcome_constraints) - ) - samples = torch.zeros(n, m) # to test constraints - - for acqf_name, acqf_class in zip( - ["qEI", "qLogEI", "qPI", "qNEI", "qLogNEI"], - [ - qExpectedImprovement, - qLogExpectedImprovement, - qProbabilityOfImprovement, - qNoisyExpectedImprovement, - qLogNoisyExpectedImprovement, - ], - ): - acqf = _get_acquisition_func( - model=model, - acquisition_function_name=acqf_name, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - # SampleReducingMCAcquisitionFunctions don't need this objective - constrained_mc_objective=None, - ) - self.assertIsInstance(acqf, acqf_class) - acqf_constraints = acqf._constraints - self.assertIsNotNone(acqf_constraints) - - # while the function pointer is different, return value has to be the same - # pyre-fixme[6]: For 1st argument expected `Iterable[_T1]` but got - # `Union[Tensor, Module]`. 
- for acqf_con, exp_con in zip(acqf_constraints, expected_constraints): - self.assertAllClose(acqf_con(samples), exp_con(samples)) - - with self.assertRaisesRegex(ValueError, NO_OBSERVED_POINTS_MESSAGE): - _get_acquisition_func( - model=model, - acquisition_function_name=acqf_name, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - X_observed=None, # errors because of no observations - ) - - # test support for PenalizedMCObjective - penalty_objective = L1PenaltyObjective(init_point=torch.zeros(1, d)) - for acqf_name, acqf_class in zip( - ["qEI", "qLogEI", "qNEI", "qLogNEI"], - [ - qExpectedImprovement, - qLogExpectedImprovement, - qNoisyExpectedImprovement, - qLogNoisyExpectedImprovement, - ], - ): - acqf = _get_acquisition_func( - model=model, - acquisition_function_name=acqf_name, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - mc_objective=PenalizedMCObjective, - constrained_mc_objective=None, - mc_objective_kwargs={ - "penalty_objective": penalty_objective, - "regularization_parameter": 0.1, - }, - ) - self.assertIsInstance(acqf, acqf_class) - acqf_constraints = acqf._constraints - self.assertIsNotNone(acqf_constraints) - self.assertIsInstance(acqf.objective, PenalizedMCObjective) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `penalty_objective`. - self.assertIsInstance(acqf.objective.penalty_objective, L1PenaltyObjective) - # pyre-fixme[16]: Item `Tensor` of `Tensor | Module` has no attribute - # `regularization_parameter`. - self.assertEqual(acqf.objective.regularization_parameter, 0.1) - - acqf_name = "qSR" - acqf_class = qSimpleRegret - acqf = _get_acquisition_func( - model=model, - acquisition_function_name=acqf_name, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - # these two need the legacy constrained objective - constrained_mc_objective=ConstrainedMCObjective, - ) - self.assertIsInstance(acqf, acqf_class) - acqf_constraints = acqf._constraints - self.assertIsNone(acqf_constraints) # because this uses the legacy path - self.assertIsInstance(acqf.objective, ConstrainedMCObjective) - - # the following two errors are only thrown when the acquisition function is - # not a SampleReducingMCAcquisitionFunction. - with self.assertRaisesRegex( - ValueError, - "constrained_mc_objective cannot be set to None " - "when applying outcome constraints.", - ): - _get_acquisition_func( - model=model, - acquisition_function_name=acqf_name, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - constrained_mc_objective=None, - ) - - # these are not yet supported, will require passing additional arguments to - # the botorch constructor (i.e. beta for UCB, ref_point and Yfor EHVI.) 
- for acqf_name in ["qUCB", "qEHVI", "qNEHVI"]: - with self.assertRaisesRegex(NotImplementedError, "not implemented yet"): - _get_acquisition_func( - model=model, - acquisition_function_name=acqf_name, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - ) - - def test_get_customized_covar_module(self) -> None: - ard_num_dims = 3 - batch_shape = torch.Size([2]) - covar_module = _get_customized_covar_module( - covar_module_prior_dict={}, - ard_num_dims=ard_num_dims, - aug_batch_shape=batch_shape, - task_feature=None, - ) - self.assertIsInstance(covar_module, Module) - self.assertIsInstance(covar_module, ScaleKernel) - self.assertIsInstance(covar_module.outputscale_prior, GammaPrior) - prior = assert_is_instance(covar_module.outputscale_prior, GammaPrior) - self.assertEqual(prior.concentration, 2.0) - self.assertEqual(prior.rate, 0.15) - self.assertIsInstance(covar_module.base_kernel, MaternKernel) - base_kernel = assert_is_instance(covar_module.base_kernel, MaternKernel) - self.assertIsInstance(base_kernel.lengthscale_prior, GammaPrior) - self.assertEqual( - assert_is_instance(base_kernel.lengthscale_prior, GammaPrior).concentration, - 3.0, - ) - self.assertEqual( - assert_is_instance(base_kernel.lengthscale_prior, GammaPrior).rate, 6.0 - ) - self.assertEqual(base_kernel.ard_num_dims, ard_num_dims) - self.assertEqual(base_kernel.batch_shape, batch_shape) - - covar_module = _get_customized_covar_module( - covar_module_prior_dict={ - "lengthscale_prior": GammaPrior(12.0, 2.0), - "outputscale_prior": GammaPrior(2.0, 12.0), - }, - ard_num_dims=ard_num_dims, - aug_batch_shape=batch_shape, - task_feature=3, - ) - self.assertIsInstance(covar_module, Module) - self.assertIsInstance(covar_module, ScaleKernel) - self.assertIsInstance(covar_module.outputscale_prior, GammaPrior) - prior = assert_is_instance(covar_module.outputscale_prior, GammaPrior) - self.assertEqual(prior.concentration, 2.0) - self.assertEqual(prior.rate, 12.0) - self.assertIsInstance(covar_module.base_kernel, MaternKernel) - base_kernel = assert_is_instance(covar_module.base_kernel, MaternKernel) - self.assertIsInstance(base_kernel.lengthscale_prior, GammaPrior) - self.assertEqual( - assert_is_instance(base_kernel.lengthscale_prior, GammaPrior).concentration, - 12.0, - ) - self.assertEqual( - assert_is_instance(base_kernel.lengthscale_prior, GammaPrior).rate, 2.0 - ) - self.assertEqual(base_kernel.ard_num_dims, ard_num_dims - 1) - self.assertEqual(base_kernel.batch_shape, batch_shape) - - def test_get_warping_transform(self) -> None: - warp_tf = get_warping_transform(d=4) - self.assertIsInstance(warp_tf, Warp) - self.assertEqual(warp_tf.indices.tolist(), list(range(4))) - warp_tf = get_warping_transform(d=4, task_feature=2) - self.assertEqual(warp_tf.indices.tolist(), [0, 1, 3]) - warp_tf = get_warping_transform(d=4, batch_shape=torch.Size([2])) - self.assertIsInstance(warp_tf, Warp) - self.assertEqual(warp_tf.indices.tolist(), list(range(4))) - self.assertEqual(warp_tf.batch_shape, torch.Size([2])) diff --git a/ax/generators/tests/test_botorch_moo_defaults.py b/ax/generators/tests/test_botorch_moo_utils.py similarity index 78% rename from ax/generators/tests/test_botorch_moo_defaults.py rename to ax/generators/tests/test_botorch_moo_utils.py index b94c9ebdee6..6baac4e14f5 100644 --- a/ax/generators/tests/test_botorch_moo_defaults.py +++ b/ax/generators/tests/test_botorch_moo_utils.py @@ -7,36 +7,30 @@ # pyre-strict from contextlib import ExitStack -from typing import Any, 
cast +from typing import Any from unittest import mock from warnings import catch_warnings, simplefilter import numpy as np import torch from ax.core.search_space import SearchSpaceDigest -from ax.generators.torch.botorch_defaults import NO_OBSERVED_POINTS_MESSAGE from ax.generators.torch.botorch_modular.generator import BoTorchGenerator -from ax.generators.torch.botorch_moo_defaults import ( - get_outcome_constraint_transforms, - get_qLogEHVI, - get_qLogNEHVI, +from ax.generators.torch.botorch_moo_utils import ( get_weighted_mc_objective_and_objective_thresholds, infer_objective_thresholds, pareto_frontier_evaluator, ) from ax.generators.torch_base import TorchGenerator -from ax.utils.common.random import with_rng_seed from ax.utils.common.testutils import TestCase from ax.utils.testing.mock import mock_botorch_optimize_context_manager from botorch.models.gp_regression import SingleTaskGP from botorch.utils.datasets import SupervisedDataset from botorch.utils.multi_objective.hypervolume import infer_reference_point -from botorch.utils.testing import MockModel, MockPosterior +from botorch.utils.testing import MockPosterior from gpytorch.utils.warnings import NumericalWarning -MOO_DEFAULTS_PATH: str = "ax.generators.torch.botorch_moo_defaults" -GET_ACQF_PATH: str = MOO_DEFAULTS_PATH + ".get_acquisition_function" +MOO_DEFAULTS_PATH: str = "ax.generators.torch.botorch_moo_utils" GET_CONSTRAINT_PATH: str = MOO_DEFAULTS_PATH + ".get_outcome_constraint_transforms" GET_OBJ_PATH: str = ( MOO_DEFAULTS_PATH + ".get_weighted_mc_objective_and_objective_thresholds" @@ -223,20 +217,7 @@ def test_pareto_frontier_evaluator_with_nan(self) -> None: self.assertEqual(idx.tolist(), [4]) -class BotorchMOODefaultsTest(TestCase): - def test_get_qLogEHVI_input_validation_errors(self) -> None: - weights = torch.ones(2) - objective_thresholds = torch.zeros(2) - # Note: this is a real BoTorch `Model` with a real `Posterior`, not a - # `unittest.mock.Mock` - mm = MockModel(posterior=MockPosterior()) - with self.assertRaisesRegex(ValueError, NO_OBSERVED_POINTS_MESSAGE): - get_qLogEHVI( - model=mm, - objective_weights=weights, - objective_thresholds=objective_thresholds, - ) - +class BotorchMOOUtilsTest(TestCase): def test_get_weighted_mc_objective_and_objective_thresholds(self) -> None: objective_weights = torch.tensor([0.0, 1.0, 0.0, 1.0]) objective_thresholds = torch.arange(4, dtype=torch.float) @@ -251,71 +232,9 @@ def test_get_weighted_mc_objective_and_objective_thresholds(self) -> None: self.assertEqual(weighted_obj.outcomes.tolist(), [1, 3]) self.assertTrue(torch.equal(new_obj_thresholds, objective_thresholds[[1, 3]])) - def test_get_qLogNEHVI_input_validation_errors(self) -> None: - weights = torch.ones(2) - objective_thresholds = torch.zeros(2) - with self.assertRaisesRegex(ValueError, NO_OBSERVED_POINTS_MESSAGE): - get_qLogNEHVI( - # pyre-fixme[6] In call `get_qLogNEHVI`, for argument `model`, - # expected `Model` but got `None`. 
- model=None, - objective_weights=weights, - objective_thresholds=objective_thresholds, - ) - - @mock.patch( # pyre-ignore - "ax.generators.torch.botorch_moo_defaults._check_posterior_type", - wraps=lambda y: y, - ) - def test_get_qLogEHVI(self, _) -> None: - weights = torch.tensor([0.0, 1.0, 1.0]) - X_observed = torch.rand(4, 3) - X_pending = torch.rand(1, 3) - constraints = (torch.tensor([1.0, 0.0, 0.0]), torch.tensor([[10.0]])) - Y = torch.rand(4, 3) - mm = MockModel(MockPosterior(mean=Y)) - objective_thresholds = torch.arange(3, dtype=torch.float) - obj_and_obj_t = get_weighted_mc_objective_and_objective_thresholds( - objective_weights=weights, - objective_thresholds=objective_thresholds, - ) - (weighted_obj, new_obj_thresholds) = obj_and_obj_t - cons_tfs = get_outcome_constraint_transforms(constraints) - with with_rng_seed(0): - seed = torch.randint(1, 10000, (1,)).item() - with ExitStack() as es: - mock_get_acqf = es.enter_context(mock.patch(GET_ACQF_PATH)) - es.enter_context( - mock.patch(MOO_DEFAULTS_PATH + ".assert_is_instance", wraps=cast) - ) - es.enter_context(mock.patch(GET_CONSTRAINT_PATH, return_value=cons_tfs)) - es.enter_context(mock.patch(GET_OBJ_PATH, return_value=obj_and_obj_t)) - es.enter_context(with_rng_seed(0)) - get_qLogEHVI( - model=mm, - objective_weights=weights, - outcome_constraints=constraints, - objective_thresholds=objective_thresholds, - X_observed=X_observed, - X_pending=X_pending, - ) - mock_get_acqf.assert_called_once_with( - acquisition_function_name="qLogEHVI", - model=mm, - objective=weighted_obj, - X_observed=X_observed, - X_pending=X_pending, - constraints=cons_tfs, - mc_samples=128, - alpha=0.0, - seed=seed, - ref_point=new_obj_thresholds.tolist(), - Y=Y, - ) - # test infer objective thresholds alone @mock.patch( # pyre-ignore - "ax.generators.torch.botorch_moo_defaults._check_posterior_type", + "ax.generators.torch.botorch_moo_utils._check_posterior_type", wraps=lambda y: y, ) def test_infer_objective_thresholds(self, _, cuda: bool = False) -> None: @@ -342,7 +261,7 @@ def test_infer_objective_thresholds(self, _, cuda: bool = False) -> None: with ExitStack() as es: _mock_infer_reference_point = es.enter_context( mock.patch( - "ax.generators.torch.botorch_moo_defaults" + "ax.generators.torch.botorch_moo_utils" ".infer_reference_point", wraps=infer_reference_point, ) @@ -410,7 +329,7 @@ def test_infer_objective_thresholds(self, _, cuda: bool = False) -> None: with ExitStack() as es: _mock_infer_reference_point = es.enter_context( mock.patch( - "ax.generators.torch.botorch_moo_defaults" + "ax.generators.torch.botorch_moo_utils" ".infer_reference_point", wraps=infer_reference_point, ) diff --git a/ax/generators/tests/test_torch_model_utils.py b/ax/generators/tests/test_torch_model_utils.py index 07cf8b1795a..da642d94ce5 100644 --- a/ax/generators/tests/test_torch_model_utils.py +++ b/ax/generators/tests/test_torch_model_utils.py @@ -10,12 +10,7 @@ import numpy as np import torch -from ax.core.search_space import SearchSpaceDigest -from ax.generators.torch.utils import ( - _generate_sobol_points, - subset_model, - tensor_callable_to_array_callable, -) +from ax.generators.torch.utils import subset_model, tensor_callable_to_array_callable from ax.utils.common.testutils import TestCase from botorch.models import SingleTaskGP from botorch.models.deterministic import GenericDeterministicModel @@ -27,28 +22,6 @@ class TorchUtilsTest(TestCase): - def test_GenerateSobolPoints(self) -> None: - bounds = [(0.0, 1.0) for _ in range(3)] - linear_constraints = ( - 
torch.tensor([[1, -1, 0]], dtype=torch.double), - torch.tensor([[0]], dtype=torch.double), - ) - - def test_rounding_func(x: Tensor) -> Tensor: - return x - - gen_sobol = _generate_sobol_points( - n_sobol=100, - search_space_digest=SearchSpaceDigest( - feature_names=["a", "b", "c"], bounds=bounds - ), - device=torch.device("cpu"), - linear_constraints=linear_constraints, - rounding_func=test_rounding_func, - ) - self.assertEqual(len(gen_sobol), 100) - self.assertIsInstance(gen_sobol, Tensor) - def test_TensorCallableToArrayCallable(self) -> None: def tensor_func(x: Tensor) -> Tensor: return torch.pow(x, 2) diff --git a/ax/generators/torch/botorch_defaults.py b/ax/generators/torch/botorch_defaults.py deleted file mode 100644 index c6aabc8c6a2..00000000000 --- a/ax/generators/torch/botorch_defaults.py +++ /dev/null @@ -1,746 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -import functools -from collections.abc import Callable -from copy import deepcopy -from random import randint -from typing import Any, Protocol - -import torch -from ax.generators.torch.botorch_modular.optimizer_defaults import ( - BATCH_LIMIT, - INIT_BATCH_LIMIT, - MAX_OPT_AGG_SIZE, -) -from botorch.acquisition import get_acquisition_function -from botorch.acquisition.acquisition import AcquisitionFunction -from botorch.acquisition.objective import ConstrainedMCObjective, GenericMCObjective -from botorch.acquisition.utils import get_infeasible_cost -from botorch.exceptions.errors import UnsupportedError -from botorch.fit import fit_gpytorch_mll -from botorch.models.gp_regression import SingleTaskGP -from botorch.models.gp_regression_fidelity import SingleTaskMultiFidelityGP -from botorch.models.gpytorch import GPyTorchModel -from botorch.models.model import Model -from botorch.models.model_list_gp_regression import ModelListGP -from botorch.models.multitask import MultiTaskGP -from botorch.models.transforms.input import Warp -from botorch.optim.optimize import optimize_acqf -from botorch.utils import ( - get_objective_weights_transform, - get_outcome_constraint_transforms, -) -from botorch.utils.multi_objective.scalarization import get_chebyshev_scalarization -from botorch.utils.transforms import is_ensemble -from gpytorch.kernels import MaternKernel, ScaleKernel -from gpytorch.kernels.kernel import Kernel -from gpytorch.mlls.exact_marginal_log_likelihood import ExactMarginalLogLikelihood -from gpytorch.mlls.leave_one_out_pseudo_likelihood import LeaveOneOutPseudoLikelihood -from gpytorch.mlls.sum_marginal_log_likelihood import SumMarginalLogLikelihood -from gpytorch.priors import Prior -from gpytorch.priors.lkj_prior import LKJCovariancePrior -from gpytorch.priors.torch_priors import GammaPrior, LogNormalPrior -from torch import Tensor - - -MIN_OBSERVED_NOISE_LEVEL = 1e-6 -NO_OBSERVED_POINTS_MESSAGE = ( - "There are no observed points meeting all parameter " - "constraints or have all necessary metrics attached." -) - - -def _construct_model( - task_feature: int | None, - Xs: list[Tensor], - Ys: list[Tensor], - Yvars: list[Tensor], - fidelity_features: list[int], - metric_signatures: list[str], - use_input_warping: bool = False, - prior: dict[str, Any] | None = None, - *, - multitask_gp_ranks: dict[str, Prior | float] | None = None, - **kwargs: Any, -) -> GPyTorchModel: - """ - Figures out how to call `_get_model` depending on inputs. 
Used by - `get_and_fit_model`. - """ - if task_feature is None: - if len(Xs) == 1: - # Use single output, single task GP - return _get_model( - X=Xs[0], - Y=Ys[0], - Yvar=Yvars[0], - task_feature=task_feature, - fidelity_features=fidelity_features, - use_input_warping=use_input_warping, - prior=deepcopy(prior), - **kwargs, - ) - if all(torch.equal(Xs[0], X) for X in Xs[1:]) and not use_input_warping: - # Use batched multioutput, single task GP - # Require using a ModelListGP if using input warping - Y = torch.cat(Ys, dim=-1) - Yvar = torch.cat(Yvars, dim=-1) - return _get_model( - X=Xs[0], - Y=Y, - Yvar=Yvar, - task_feature=task_feature, - fidelity_features=fidelity_features, - prior=deepcopy(prior), - **kwargs, - ) - - if task_feature is None: - models = [ - _get_model( - X=X, - Y=Y, - Yvar=Yvar, - use_input_warping=use_input_warping, - prior=deepcopy(prior), - **kwargs, - ) - for X, Y, Yvar in zip(Xs, Ys, Yvars) - ] - else: - # use multi-task GP - mtgp_rank_dict = {} if multitask_gp_ranks is None else multitask_gp_ranks - # assembles list of ranks associated with each metric - if len({len(Xs), len(Ys), len(Yvars), len(metric_signatures)}) > 1: - raise ValueError( - "Lengths of Xs, Ys, Yvars, and metric_signatures must match. Your " - f"inputs have lengths {len(Xs)}, {len(Ys)}, {len(Yvars)}, and " - f"{len(metric_signatures)}, respectively." - ) - mtgp_rank_list = [ - mtgp_rank_dict.get(metric, None) for metric in metric_signatures - ] - models = [ - _get_model( - X=X, - Y=Y, - Yvar=Yvar, - task_feature=task_feature, - rank=mtgp_rank, - use_input_warping=use_input_warping, - prior=deepcopy(prior), - **kwargs, - ) - for X, Y, Yvar, mtgp_rank in zip(Xs, Ys, Yvars, mtgp_rank_list) - ] - return ModelListGP(*models) - - -def get_and_fit_model( - Xs: list[Tensor], - Ys: list[Tensor], - Yvars: list[Tensor], - task_features: list[int], - fidelity_features: list[int], - metric_signatures: list[str], - state_dict: dict[str, Tensor] | None = None, - refit_model: bool = True, - use_input_warping: bool = False, - use_loocv_pseudo_likelihood: bool = False, - prior: dict[str, Any] | None = None, - *, - multitask_gp_ranks: dict[str, Prior | float] | None = None, - **kwargs: Any, -) -> GPyTorchModel: - r"""Instantiates and fits a botorch GPyTorchModel using the given data. - N.B. Currently, the logic for choosing ModelListGP vs other models is handled - using if-else statements in lines 96-137. In the future, this logic should be - taken care of by modular botorch. - - Args: - Xs: List of X data, one tensor per outcome. - Ys: List of Y data, one tensor per outcome. - Yvars: List of observed variance of Ys. - task_features: List of columns of X that are tasks. - fidelity_features: List of columns of X that are fidelity parameters. - metric_signatures: Signature of each outcome Y in Ys. - state_dict: If provided, will set model parameters to this state - dictionary. Otherwise, will fit the model. - refit_model: Flag for refitting model. - prior: Optional[Dict]. A dictionary that contains the specification of - GP model prior. Currently, the keys include: - - covar_module_prior: prior on covariance matrix e.g. - {"lengthscale_prior": GammaPrior(3.0, 6.0)}. - - type: type of prior on task covariance matrix e.g.`LKJCovariancePrior`. - - sd_prior: A scalar prior over nonnegative numbers, which is used for the - default LKJCovariancePrior task_covar_prior. - - eta: The eta parameter on the default LKJ task_covar_prior. - kwargs: Passed to `_get_model`. - - Returns: - A fitted GPyTorchModel. 
- """ - - if len(fidelity_features) > 0 and len(task_features) > 0: - raise NotImplementedError( - "Currently do not support MF-GP models with task_features!" - ) - if len(fidelity_features) > 1: - raise NotImplementedError( - "Fidelity MF-GP models currently support only a single fidelity parameter!" - ) - if len(task_features) > 1: - raise NotImplementedError( - f"This model only supports 1 task feature (got {task_features})" - ) - elif len(task_features) == 1: - task_feature = task_features[0] - else: - task_feature = None - - model = _construct_model( - task_feature=task_feature, - Xs=Xs, - Ys=Ys, - Yvars=Yvars, - fidelity_features=fidelity_features, - metric_signatures=metric_signatures, - use_input_warping=use_input_warping, - prior=prior, - multitask_gp_ranks=multitask_gp_ranks, - **kwargs, - ) - - # TODO: Better logic for deciding when to use a ModelListGP. Currently the - # logic is unclear. The two cases in which ModelListGP is used are - # (i) the training inputs (Xs) are not the same for the different outcomes, and - # (ii) a multi-task model is used - - model.to(Xs[0]) - if state_dict is not None: - model.load_state_dict(state_dict) - if state_dict is None or refit_model: - # TODO: Add bounds for optimization stability - requires revamp upstream - bounds = {} - if use_loocv_pseudo_likelihood: - mll_cls = LeaveOneOutPseudoLikelihood - else: - mll_cls = ExactMarginalLogLikelihood - if isinstance(model, ModelListGP): - mll = SumMarginalLogLikelihood(model.likelihood, model, mll_cls=mll_cls) - else: - mll = mll_cls(model.likelihood, model) - mll = fit_gpytorch_mll(mll, optimizer_kwargs={"bounds": bounds}) - return model - - -class TAcqfConstructor(Protocol): - def __call__( - self, # making this a static method makes Pyre unhappy, better to keep `self` - model: Model, - objective_weights: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - **kwargs: Any, - ) -> AcquisitionFunction: ... # pragma: no cover - - -def get_acqf( - acquisition_function_name: str, -) -> Callable[[Callable[[], None]], TAcqfConstructor]: - """Returns a decorator whose wrapper function instantiates an acquisition function. - - NOTE: This is a decorator factory instead of a simple factory as serialization - of Botorch model kwargs requires callables to be have module-level paths, and - closures created by a simple factory do not have such paths. We solve this by - wrapping "empty" module-level functions with this decorator, we ensure that they - are serialized correctly, in addition to reducing code duplication. - - Example: - >>> @get_acqf("qEI") - ... def get_qEI() -> None: - ... pass - >>> acqf = get_qEI( - ... model=model, - ... objective_weights=objective_weights, - ... outcome_constraints=outcome_constraints, - ... X_observed=X_observed, - ... X_pending=X_pending, - ... **kwargs, - ... ) - >>> type(acqf) - ... botorch.acquisition.monte_carlo.qExpectedImprovement - - Args: - acquisition_function_name: The name of the acquisition function to be - instantiated by the returned function. - - Returns: - A decorator whose wrapper function is a TAcqfConstructor, i.e. it requires a - `model`, `objective_weights`, and optional `outcome_constraints`, `X_observed`, - and `X_pending` as inputs, as well as `kwargs`, and returns an - `AcquisitionFunction` instance that corresponds to `acquisition_function_name`. 
- """ - - def decorator(empty_acqf_getter: Callable[[], None]) -> TAcqfConstructor: - # `wraps` allows the function to keep its original, module-level name, enabling - # serialization via `callable_to_reference`. `empty_acqf_getter` is otherwise - # not used in the wrapper. - @functools.wraps(empty_acqf_getter) - def wrapper( - model: Model, - objective_weights: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - **kwargs: Any, - ) -> AcquisitionFunction: - kwargs.pop("objective_thresholds", None) - return _get_acquisition_func( - model=model, - acquisition_function_name=acquisition_function_name, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - X_pending=X_pending, - **kwargs, - ) - - return wrapper - - return decorator - - -@get_acqf("qEI") -def get_qEI() -> None: - """A TAcqfConstructor to instantiate a qEI acquisition function. The function body - is filled in by the decorator function `get_acqf` to simultaneously reduce code - duplication and allow serialization in Ax. TODO: Deprecate with legacy Ax model. - """ - - -@get_acqf("qLogEI") -def get_qLogEI() -> None: - """TAcqfConstructor instantiating qLogEI. See docstring of get_qEI for details.""" - - -@get_acqf("qNEI") -def get_NEI() -> None: # no "q" in method name for backward compatibility - """TAcqfConstructor instantiating qNEI. See docstring of get_qEI for details.""" - - -@get_acqf("qLogNEI") -def get_qLogNEI() -> None: - """TAcqfConstructor instantiating qLogNEI. See docstring of get_qEI for details.""" - - -def _get_acquisition_func( - model: Model, - acquisition_function_name: str, - objective_weights: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - mc_objective: type[GenericMCObjective] = GenericMCObjective, - constrained_mc_objective: None - | (type[ConstrainedMCObjective]) = ConstrainedMCObjective, - # pyre-fixme[24]: Generic type `dict` expects 2 type parameters, use - # `typing.Dict` to avoid runtime subscripting errors. - mc_objective_kwargs: dict | None = None, - *, - chebyshev_scalarization: bool = False, - prune_baseline: bool = True, - mc_samples: int = 512, - marginalize_dim: int | None = None, -) -> AcquisitionFunction: - r"""Instantiates a acquisition function. - - Args: - model: The underlying model which the acqusition function uses - to estimate acquisition values of candidates. - acquisition_function_name: Name of the acquisition function. - objective_weights: The objective is to maximize a weighted sum of - the columns of f(x). These are the weights. - outcome_constraints: A tuple of (A, b). For k outcome constraints - and m outputs at f(x), A is (k x m) and b is (k x 1) such that - A f(x) <= b. (Not used by single task models) - X_observed: A tensor containing points observed for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - X_pending: A tensor containing points whose evaluation is pending (i.e. - that have been submitted for evaluation) present for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - mc_objective: GenericMCObjective class, used for constructing a - MC-objective. If constructing a penalized MC-objective, pass in - PenalizedMCObjective together with mc_objective_kwargs . 
- constrained_mc_objective: ConstrainedMCObjective class, used when - applying constraints on the outcomes. - mc_objective_kwargs: kwargs for constructing MC-objective. - For GenericMCObjective, leave it as None. For PenalizedMCObjective, - it needs to be specified in the format of kwargs. - mc_samples: The number of MC samples to use (default: 512). - prune_baseline: If True, prune the baseline points for NEI (default: True). - chebyshev_scalarization: Use augmented Chebyshev scalarization. - - Returns: - The instantiated acquisition function. - """ - if acquisition_function_name not in [ - "qSR", - "qEI", - "qLogEI", - "qPI", - "qNEI", - "qLogNEI", - ]: - raise NotImplementedError(f"{acquisition_function_name=} not implemented yet.") - - if X_observed is None: - raise ValueError(NO_OBSERVED_POINTS_MESSAGE) - # construct Objective module - if chebyshev_scalarization: - with torch.no_grad(): - Y = model.posterior(X_observed).mean # pyre-ignore [16] - if is_ensemble(model): - Y = torch.mean(Y, dim=0) - obj_tf = get_chebyshev_scalarization(weights=objective_weights, Y=Y) - else: - obj_tf = get_objective_weights_transform(objective_weights) - - # pyre-fixme[53]: Captured variable `obj_tf` is not annotated. - def objective(samples: Tensor, X: Tensor | None = None) -> Tensor: - return obj_tf(samples) - - mc_objective_kwargs = {} if mc_objective_kwargs is None else mc_objective_kwargs - objective = mc_objective(objective=objective, **mc_objective_kwargs) - - if outcome_constraints is None: - con_tfs = None - else: - con_tfs = get_outcome_constraint_transforms(outcome_constraints) - # All acquisition functions registered in BoTorch's `get_acquisition_function` - # except qSR and qUCB support a principled treatment of the constraints by - # directly passing them to the constructor. - if acquisition_function_name == "qSR": - if constrained_mc_objective is None: - raise ValueError( - "constrained_mc_objective cannot be set to None " - "when applying outcome constraints." - ) - - inf_cost = get_infeasible_cost( - X=X_observed, model=model, objective=objective - ) - objective = constrained_mc_objective( - objective=objective, constraints=con_tfs or [], infeasible_cost=inf_cost - ) - - return get_acquisition_function( - acquisition_function_name=acquisition_function_name, - model=model, - objective=objective, - X_observed=X_observed, - X_pending=X_pending, - prune_baseline=prune_baseline, - mc_samples=mc_samples, - seed=randint(1, 10000), - marginalize_dim=marginalize_dim, - constraints=con_tfs, - ) - - -def scipy_optimizer( - acq_function: AcquisitionFunction, - bounds: Tensor, - n: int, - inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, - equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, - fixed_features: dict[int, float] | None = None, - rounding_func: Callable[[Tensor], Tensor] | None = None, - *, - num_restarts: int = 20, - raw_samples: int | None = None, - joint_optimization: bool = False, - options: dict[str, bool | float | int | str] | None = None, -) -> tuple[Tensor, Tensor]: - r"""Optimizer using scipy's minimize module on a numpy-adpator. - - Args: - acq_function: A botorch AcquisitionFunction. - bounds: A `2 x d`-dim tensor, where `bounds[0]` (`bounds[1]`) are the - lower (upper) bounds of the feasible hyperrectangle. - n: The number of candidates to generate. 
- inequality constraints: A list of tuples (indices, coefficients, rhs), - with each tuple encoding an inequality constraint of the form - `\sum_i (X[indices[i]] * coefficients[i]) >= rhs` - equality constraints: A list of tuples (indices, coefficients, rhs), - with each tuple encoding an equality constraint of the form - `\sum_i (X[indices[i]] * coefficients[i]) == rhs` - fixed_features: A map {feature_index: value} for features that should - be fixed to a particular value during generation. - rounding_func: A function that rounds an optimization result - appropriately (i.e., according to `round-trip` transformations). - - Returns: - 2-element tuple containing - - - A `n x d`-dim tensor of generated candidates. - - In the case of joint optimization, a scalar tensor containing - the joint acquisition value of the `n` points. In the case of - sequential optimization, a `n`-dim tensor of conditional acquisition - values, where `i`-th element is the expected acquisition value - conditional on having observed candidates `0,1,...,i-1`. - """ - - sequential = not joint_optimization - optimize_acqf_options: dict[str, bool | float | int | str] = { - "batch_limit": BATCH_LIMIT, - "init_batch_limit": INIT_BATCH_LIMIT, - "max_optimization_problem_aggregation_size": MAX_OPT_AGG_SIZE, - } - if options is not None: - optimize_acqf_options.update(options) - X, expected_acquisition_value = optimize_acqf( - acq_function=acq_function, - bounds=bounds, - q=n, - num_restarts=num_restarts, - raw_samples=50 * num_restarts if raw_samples is None else raw_samples, - options=optimize_acqf_options, - inequality_constraints=inequality_constraints, - equality_constraints=equality_constraints, - fixed_features=fixed_features, - sequential=sequential, - post_processing_func=rounding_func, - ) - return X, expected_acquisition_value - - -def _get_model( - X: Tensor, - Y: Tensor, - Yvar: Tensor, - task_feature: int | None = None, - fidelity_features: list[int] | None = None, - use_input_warping: bool = False, - covar_module: Kernel | None = None, - prior: dict[str, Any] | None = None, - **kwargs: Any, -) -> GPyTorchModel: - """Instantiate a model of type depending on the input data. - - Args: - X: A `n x d` tensor of input features. - Y: A `n x m` tensor of input observations. - Yvar: A `n x m` tensor of input variances (NaN if unobserved). - task_feature: The index of the column pertaining to the task feature - (if present). - fidelity_features: List of columns of X that are fidelity parameters. - covar_module: Optional. A data kernel of GP model. - prior: Optional[Dict]. A dictionary that contains the specification of - GP model prior. Currently, the keys include: - - covar_module_prior: prior on covariance matrix e.g. - {"lengthscale_prior": GammaPrior(3.0, 6.0)}. - - type: type of prior on task covariance matrix e.g.`LKJCovariancePrior`. - - sd_prior: A scalar prior over nonnegative numbers, which is used for the - default LKJCovariancePrior task_covar_prior. - - eta: The eta parameter on the default LKJ task_covar_prior. - - Returns: - A GPyTorchModel (unfitted). - """ - Yvar = Yvar.clamp_min(MIN_OBSERVED_NOISE_LEVEL) - is_nan = torch.isnan(Yvar) - any_nan_Yvar = torch.any(is_nan) - all_nan_Yvar = torch.all(is_nan) - if any_nan_Yvar and not all_nan_Yvar: - if task_feature: - # TODO (jej): Replace with inferred noise before making perf judgements. - Yvar[Yvar != Yvar] = MIN_OBSERVED_NOISE_LEVEL - else: - raise ValueError( - "Mix of known and unknown variances indicates valuation function " - "errors. 
Variances should all be specified, or none should be." - ) - if use_input_warping: - if Y.shape[-1] > 1 and X.ndim > 2: - raise UnsupportedError( - "Input warping is not supported for batched multi output models." - ) - warp_tf = get_warping_transform( - d=X.shape[-1], - task_feature=task_feature, - batch_shape=X.shape[:-2], - ) - else: - warp_tf = None - if fidelity_features is None: - fidelity_features = [] - if len(fidelity_features) == 0: - # only pass linear_truncated arg if there are fidelities - kwargs = {k: v for k, v in kwargs.items() if k != "linear_truncated"} - # construct kernel based on customized prior if covar_module is None - prior_dict = prior or {} - covar_module_prior_dict = prior_dict.pop("covar_module_prior", None) - if (covar_module_prior_dict is not None) and (covar_module is None): - covar_module = _get_customized_covar_module( - covar_module_prior_dict=covar_module_prior_dict, - ard_num_dims=X.shape[-1], - aug_batch_shape=_get_aug_batch_shape(X, Y), - task_feature=task_feature, - ) - - if len(fidelity_features) > 0: - if task_feature: - raise NotImplementedError( - "multi-task multi-fidelity models not yet available" - ) - # at this point we can assume that there is only a single fidelity parameter - gp = SingleTaskMultiFidelityGP( - train_X=X, - train_Y=Y, - data_fidelities=fidelity_features[:1], - input_transform=warp_tf, - **kwargs, - ) - elif task_feature is None: - gp = SingleTaskGP( - train_X=X, - train_Y=Y, - train_Yvar=None if all_nan_Yvar else Yvar, - covar_module=covar_module, - input_transform=warp_tf, - **{"outcome_transform": None, **kwargs}, - ) - else: - # instantiate multitask GP - all_tasks, _, _ = MultiTaskGP.get_all_tasks(X, task_feature) - num_tasks = len(all_tasks) - task_covar_prior = None - if len(prior_dict) > 0: - prior_type = prior_dict.get("type", None) - if issubclass(prior_type, LKJCovariancePrior): - sd_prior = prior_dict.get("sd_prior", GammaPrior(1.0, 0.15)) - sd_prior._event_shape = torch.Size([num_tasks]) - eta = prior_dict.get("eta", 0.5) - if not isinstance(eta, float) and not isinstance(eta, int): - raise ValueError(f"eta must be a real number, your eta was {eta}") - task_covar_prior = LKJCovariancePrior(num_tasks, eta, sd_prior) - - else: - raise NotImplementedError( - "Currently only LKJ prior is supported," - f"your prior type was {prior_type}." - ) - - gp = MultiTaskGP( - train_X=X, - train_Y=Y, - train_Yvar=None if all_nan_Yvar else Yvar, - task_feature=task_feature, - covar_module=covar_module, - rank=kwargs.get("rank"), - task_covar_prior=task_covar_prior, - input_transform=warp_tf, - # specify output_tasks so that model.num_outputs - # is 1, since the model is only modeling - # a since metric. - output_tasks=all_tasks[:1], - ) - return gp - - -def _get_customized_covar_module( - covar_module_prior_dict: dict[str, Prior], - ard_num_dims: int, - aug_batch_shape: torch.Size, - task_feature: int | None = None, -) -> Kernel: - """Construct a GP kernel based on customized prior dict. - - Args: - covar_module_prior_dict: Dict. The keys are the names of the prior and values - are the priors. e.g. {"lengthscale_prior": GammaPrior(3.0, 6.0)}. - ard_num_dims: The dimension of the inputs, including task features. - aug_batch_shape: The output dimension augmented batch shape of the model - (different from the batch shape for batched multi-output models). - task_feature: The index of the task feature. 
- """ - # TODO: add more checks of covar_module_prior_dict - if task_feature is not None: - ard_num_dims -= 1 - return ScaleKernel( - MaternKernel( - nu=2.5, - ard_num_dims=ard_num_dims, - batch_shape=aug_batch_shape, - lengthscale_prior=covar_module_prior_dict.get( - "lengthscale_prior", GammaPrior(3.0, 6.0) - ), - ), - batch_shape=aug_batch_shape, - outputscale_prior=covar_module_prior_dict.get( - "outputscale_prior", GammaPrior(2.0, 0.15) - ), - ) - - -def _get_aug_batch_shape(X: Tensor, Y: Tensor) -> torch.Size: - """Obtain the output-augmented batch shape of GP model. - - Args: - X: A `(input_batch_shape) x n x d` tensor of input features. - Y: A `n x m` tensor of input observations. - - Returns: - The output-augmented batch shape: `input_batch_shape x (m)` - """ - batch_shape = X.shape[:-2] - num_outputs = Y.shape[-1] - if num_outputs > 1: - batch_shape += torch.Size([num_outputs]) # pyre-ignore - return batch_shape - - -def get_warping_transform( - d: int, - batch_shape: torch.Size | None = None, - task_feature: int | None = None, -) -> Warp: - """Construct input warping transform. - - Args: - d: The dimension of the input, including task features - batch_shape: The batch_shape of the model - task_feature: The index of the task feature - - Returns: - The input warping transform. - """ - indices = list(range(d)) - # apply warping to all non-task features, including fidelity features - if task_feature is not None: - del indices[task_feature] - # Legacy Ax models operate in the unit cube - bounds = torch.zeros(2, d, dtype=torch.double) - bounds[1] = 1 - # Note: this currently uses the same warping functions for all tasks - tf = Warp( - d=d, - indices=indices, - # prior with a median of 1 - concentration1_prior=LogNormalPrior(0.0, 0.75**0.5), - concentration0_prior=LogNormalPrior(0.0, 0.75**0.5), - batch_shape=batch_shape, - # Legacy Ax models operate in the unit cube - bounds=bounds, - ) - return tf diff --git a/ax/generators/torch/botorch_modular/acquisition.py b/ax/generators/torch/botorch_modular/acquisition.py index d57528104ad..629a8311c7a 100644 --- a/ax/generators/torch/botorch_modular/acquisition.py +++ b/ax/generators/torch/botorch_modular/acquisition.py @@ -25,7 +25,7 @@ ) from ax.generators.torch.botorch_modular.optimizer_argparse import optimizer_argparse from ax.generators.torch.botorch_modular.surrogate import Surrogate -from ax.generators.torch.botorch_moo_defaults import infer_objective_thresholds +from ax.generators.torch.botorch_moo_utils import infer_objective_thresholds from ax.generators.torch.utils import ( _get_X_pending_and_observed, get_botorch_objective_and_transform, diff --git a/ax/generators/torch/botorch_moo_defaults.py b/ax/generators/torch/botorch_moo_defaults.py deleted file mode 100644 index 6dac5340275..00000000000 --- a/ax/generators/torch/botorch_moo_defaults.py +++ /dev/null @@ -1,775 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -""" -References - -.. [Daulton2020qehvi] - S. Daulton, M. Balandat, and E. Bakshy. Differentiable Expected Hypervolume - Improvement for Parallel Multi-Objective Bayesian Optimization. Advances in Neural - Information Processing Systems 33, 2020. - -.. [Daulton2021nehvi] - S. Daulton, M. Balandat, and E. Bakshy. Parallel Bayesian Optimization of - Multiple Noisy Objectives with Expected Hypervolume Improvement. 
Advances - in Neural Information Processing Systems 34, 2021. - -.. [Ament2023logei] - S. Ament, S. Daulton, D. Eriksson, M. Balandat, and E. Bakshy. - Unexpected Improvements to Expected Improvement for Bayesian Optimization. Advances - in Neural Information Processing Systems 36, 2023. -""" - -from __future__ import annotations - -from collections.abc import Callable -from typing import cast, Optional, Union - -import torch -from ax.exceptions.core import AxError -from ax.generators.torch.botorch_defaults import NO_OBSERVED_POINTS_MESSAGE -from ax.generators.torch.botorch_modular.optimizer_defaults import ( - BATCH_LIMIT, - INIT_BATCH_LIMIT, -) -from ax.generators.torch.utils import get_outcome_constraint_transforms, subset_model -from ax.generators.torch_base import TorchGenerator -from botorch.acquisition import get_acquisition_function -from botorch.acquisition.acquisition import AcquisitionFunction -from botorch.acquisition.multi_objective.logei import ( - qLogExpectedHypervolumeImprovement, - qLogNoisyExpectedHypervolumeImprovement, -) -from botorch.acquisition.multi_objective.monte_carlo import ( - qExpectedHypervolumeImprovement, - qNoisyExpectedHypervolumeImprovement, -) -from botorch.acquisition.multi_objective.objective import WeightedMCMultiOutputObjective -from botorch.acquisition.multi_objective.utils import get_default_partitioning_alpha -from botorch.models.model import Model -from botorch.optim.optimize import optimize_acqf_list -from botorch.posteriors.gpytorch import GPyTorchPosterior -from botorch.posteriors.posterior import Posterior -from botorch.posteriors.posterior_list import PosteriorList -from botorch.utils.multi_objective.hypervolume import infer_reference_point -from botorch.utils.multi_objective.pareto import is_non_dominated -from pyre_extensions import assert_is_instance, none_throws -from torch import Tensor - -DEFAULT_EHVI_MC_SAMPLES = 128 - - -# Callable that takes tensors of observations and model parameters, -# then returns means of observations that make up a pareto frontier, -# along with their covariances and their index in the input observations. -TFrontierEvaluator = Callable[ - [ - TorchGenerator, - Tensor, - Optional[Tensor], - Optional[Tensor], - Optional[Tensor], - Optional[Tensor], - Optional[tuple[Tensor, Tensor]], - ], - tuple[Tensor, Tensor, Tensor], -] - -NO_FEASIBLE_POINTS_MESSAGE = ( - " Cannot infer objective thresholds due to no observed feasible points. " - " This likely means that one or more outcome constraints is set too strictly. " - " Consider adding thresholds to your objectives to bypass this error." -) - - -def get_weighted_mc_objective_and_objective_thresholds( - objective_weights: Tensor, objective_thresholds: Tensor -) -> tuple[WeightedMCMultiOutputObjective, Tensor]: - r"""Construct weighted objective and apply the weights to objective thresholds. - - Args: - objective_weights: The objective is to maximize a weighted sum of - the columns of f(x). These are the weights. - objective_thresholds: A tensor containing thresholds forming a reference point - from which to calculate pareto frontier hypervolume. Points that do not - dominate the objective_thresholds contribute nothing to hypervolume. 
- - Returns: - A two-element tuple with the objective and objective thresholds: - - - The objective - - The objective thresholds - - """ - nonzero_idcs = objective_weights.nonzero(as_tuple=False).view(-1) - objective_weights = objective_weights[nonzero_idcs] - objective_thresholds = objective_thresholds[nonzero_idcs] - objective = WeightedMCMultiOutputObjective( - weights=objective_weights, outcomes=nonzero_idcs.tolist() - ) - objective_thresholds = torch.mul(objective_thresholds, objective_weights) - return objective, objective_thresholds - - -def get_NEHVI( - model: Model, - objective_weights: Tensor, - objective_thresholds: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - *, - prune_baseline: bool = True, - mc_samples: int = DEFAULT_EHVI_MC_SAMPLES, - alpha: float | None = None, - marginalize_dim: int | None = None, - cache_root: bool = True, - seed: int | None = None, -) -> qNoisyExpectedHypervolumeImprovement: - r"""Instantiates a qNoisyExpectedHyperVolumeImprovement acquisition function. - - Args: - model: The underlying model which the acqusition function uses - to estimate acquisition values of candidates. - objective_weights: The objective is to maximize a weighted sum of - the columns of f(x). These are the weights. - outcome_constraints: A tuple of (A, b). For k outcome constraints - and m outputs at f(x), A is (k x m) and b is (k x 1) such that - A f(x) <= b. (Not used by single task models) - X_observed: A tensor containing points observed for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - X_pending: A tensor containing points whose evaluation is pending (i.e. - that have been submitted for evaluation) present for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - prune_baseline: If True, prune the baseline points for NEI (default: True). - mc_samples: The number of MC samples to use (default: 512). - alpha: The hyperparameter controlling the approximate non-dominated - partitioning. The default value of 0.0 means an exact partitioning - is used. As the number of objectives `m` increases, consider increasing - this parameter in order to limit computational complexity (default: None). - marginalize_dim: The dimension along which to marginalize over, used for fully - Bayesian models (default: None). - cache_root: If True, cache the root of the covariance matrix (default: True). - seed: The random seed for generating random starting points for optimization ( - default: None). - - Returns: - qNoisyExpectedHyperVolumeImprovement: The instantiated acquisition function. 
- """ - return assert_is_instance( - _get_NEHVI( - acqf_name="qNEHVI", - model=model, - objective_weights=objective_weights, - objective_thresholds=objective_thresholds, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - X_pending=X_pending, - prune_baseline=prune_baseline, - mc_samples=mc_samples, - alpha=alpha, - marginalize_dim=marginalize_dim, - cache_root=cache_root, - seed=seed, - ), - qNoisyExpectedHypervolumeImprovement, - ) - - -def get_qLogNEHVI( - model: Model, - objective_weights: Tensor, - objective_thresholds: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - *, - prune_baseline: bool = True, - mc_samples: int = DEFAULT_EHVI_MC_SAMPLES, - alpha: float | None = None, - marginalize_dim: int | None = None, - cache_root: bool = True, - seed: int | None = None, -) -> qLogNoisyExpectedHypervolumeImprovement: - r"""Instantiates a qLogNoisyExpectedHyperVolumeImprovement acquisition function. - - Args: - model: The underlying model which the acqusition function uses - to estimate acquisition values of candidates. - objective_weights: The objective is to maximize a weighted sum of - the columns of f(x). These are the weights. - outcome_constraints: A tuple of (A, b). For k outcome constraints - and m outputs at f(x), A is (k x m) and b is (k x 1) such that - A f(x) <= b. (Not used by single task models) - X_observed: A tensor containing points observed for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - X_pending: A tensor containing points whose evaluation is pending (i.e. - that have been submitted for evaluation) present for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - prune_baseline: If True, prune the baseline points for NEI (default: True). - mc_samples: The number of MC samples to use (default: 512). - alpha: The hyperparameter controlling the approximate non-dominated - partitioning. The default value of 0.0 means an exact partitioning - is used. As the number of objectives `m` increases, consider increasing - this parameter in order to limit computational complexity (default: None). - marginalize_dim: The dimension along which to marginalize over, used for fully - Bayesian models (default: None). - cache_root: If True, cache the root of the covariance matrix (default: True). - seed: The random seed for generating random starting points for optimization ( - default: None). - - Returns: - qLogNoisyExpectedHyperVolumeImprovement: The instantiated acquisition function. 
- """ - return assert_is_instance( - _get_NEHVI( - acqf_name="qLogNEHVI", - model=model, - objective_weights=objective_weights, - objective_thresholds=objective_thresholds, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - X_pending=X_pending, - prune_baseline=prune_baseline, - mc_samples=mc_samples, - alpha=alpha, - marginalize_dim=marginalize_dim, - cache_root=cache_root, - seed=seed, - ), - qLogNoisyExpectedHypervolumeImprovement, - ) - - -def _get_NEHVI( - acqf_name: str, - model: Model, - objective_weights: Tensor, - objective_thresholds: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - *, - prune_baseline: bool = True, - mc_samples: int = DEFAULT_EHVI_MC_SAMPLES, - alpha: float | None = None, - marginalize_dim: int | None = None, - cache_root: bool = True, - seed: int | None = None, -) -> qNoisyExpectedHypervolumeImprovement | qLogNoisyExpectedHypervolumeImprovement: - if X_observed is None: - raise ValueError(NO_OBSERVED_POINTS_MESSAGE) - # construct Objective module - ( - objective, - objective_thresholds, - ) = get_weighted_mc_objective_and_objective_thresholds( - objective_weights=objective_weights, objective_thresholds=objective_thresholds - ) - # For EHVI acquisition functions we pass the constraint transform directly. - if outcome_constraints is None: - cons_tfs = None - else: - cons_tfs = get_outcome_constraint_transforms(outcome_constraints) - num_objectives = objective_thresholds.shape[0] - if alpha is None: - alpha = get_default_partitioning_alpha(num_objectives=num_objectives) - # NOTE: Not using checked_cast here because for Python 3.9, isinstance fails with - # `TypeError: Subscripted generics cannot be used with class and instance checks`. - return cast( - Union[ - qNoisyExpectedHypervolumeImprovement, - qLogNoisyExpectedHypervolumeImprovement, - ], - get_acquisition_function( - acquisition_function_name=acqf_name, - model=model, - objective=objective, - X_observed=X_observed, - X_pending=X_pending, - constraints=cons_tfs, - prune_baseline=prune_baseline, - mc_samples=mc_samples, - alpha=alpha, - seed=( - seed - if seed is not None - else cast(int, torch.randint(1, 10000, (1,)).item()) - ), - ref_point=objective_thresholds.tolist(), - marginalize_dim=marginalize_dim, - cache_root=cache_root, - ), - ) - - -def get_EHVI( - model: Model, - objective_weights: Tensor, - objective_thresholds: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - *, - mc_samples: int = DEFAULT_EHVI_MC_SAMPLES, - alpha: float | None = None, - seed: int | None = None, -) -> qExpectedHypervolumeImprovement: - r"""Instantiates a qExpectedHyperVolumeImprovement acquisition function. - - Args: - model: The underlying model which the acqusition function uses - to estimate acquisition values of candidates. - objective_weights: The objective is to maximize a weighted sum of - the columns of f(x). These are the weights. - objective_thresholds: A tensor containing thresholds forming a reference point - from which to calculate pareto frontier hypervolume. Points that do not - dominate the objective_thresholds contribute nothing to hypervolume. - outcome_constraints: A tuple of (A, b). For k outcome constraints - and m outputs at f(x), A is (k x m) and b is (k x 1) such that - A f(x) <= b. 
(Not used by single task models) - X_observed: A tensor containing points observed for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - X_pending: A tensor containing points whose evaluation is pending (i.e. - that have been submitted for evaluation) present for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - mc_samples: The number of MC samples to use (default: 512). - alpha: The hyperparameter controlling the approximate non-dominated - partitioning. The default value of 0.0 means an exact partitioning - is used. As the number of objectives `m` increases, consider increasing - this parameter in order to limit computational complexity. - seed: The random seed for generating random starting points for optimization. - - Returns: - qExpectedHypervolumeImprovement: The instantiated acquisition function. - """ - return assert_is_instance( - _get_EHVI( - acqf_name="qEHVI", - model=model, - objective_weights=objective_weights, - objective_thresholds=objective_thresholds, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - X_pending=X_pending, - mc_samples=mc_samples, - alpha=alpha, - seed=seed, - ), - qExpectedHypervolumeImprovement, - ) - - -def get_qLogEHVI( - model: Model, - objective_weights: Tensor, - objective_thresholds: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - *, - mc_samples: int = DEFAULT_EHVI_MC_SAMPLES, - alpha: float | None = None, - seed: int | None = None, -) -> qLogExpectedHypervolumeImprovement: - r"""Instantiates a qLogExpectedHyperVolumeImprovement acquisition function. - - Args: - model: The underlying model which the acqusition function uses - to estimate acquisition values of candidates. - objective_weights: The objective is to maximize a weighted sum of - the columns of f(x). These are the weights. - objective_thresholds: A tensor containing thresholds forming a reference point - from which to calculate pareto frontier hypervolume. Points that do not - dominate the objective_thresholds contribute nothing to hypervolume. - outcome_constraints: A tuple of (A, b). For k outcome constraints - and m outputs at f(x), A is (k x m) and b is (k x 1) such that - A f(x) <= b. (Not used by single task models) - X_observed: A tensor containing points observed for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - X_pending: A tensor containing points whose evaluation is pending (i.e. - that have been submitted for evaluation) present for all objective - outcomes and outcomes that appear in the outcome constraints (if - there are any). - mc_samples: The number of MC samples to use (default: 512). - alpha: The hyperparameter controlling the approximate non-dominated - partitioning. The default value of 0.0 means an exact partitioning - is used. As the number of objectives `m` increases, consider increasing - this parameter in order to limit computational complexity. - seed: The random seed for generating random starting points for optimization. - - Returns: - qLogExpectedHypervolumeImprovement: The instantiated acquisition function. 
- """ - return assert_is_instance( - _get_EHVI( - acqf_name="qLogEHVI", - model=model, - objective_weights=objective_weights, - objective_thresholds=objective_thresholds, - outcome_constraints=outcome_constraints, - X_observed=X_observed, - X_pending=X_pending, - mc_samples=mc_samples, - alpha=alpha, - seed=seed, - ), - qLogExpectedHypervolumeImprovement, - ) - - -def _get_EHVI( - acqf_name: str, - model: Model, - objective_weights: Tensor, - objective_thresholds: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - X_observed: Tensor | None = None, - X_pending: Tensor | None = None, - *, - mc_samples: int = DEFAULT_EHVI_MC_SAMPLES, - alpha: float | None = None, - seed: int | None = None, -) -> qExpectedHypervolumeImprovement | qLogExpectedHypervolumeImprovement: - if X_observed is None: - raise ValueError(NO_OBSERVED_POINTS_MESSAGE) - # construct Objective module - ( - objective, - objective_thresholds, - ) = get_weighted_mc_objective_and_objective_thresholds( - objective_weights=objective_weights, objective_thresholds=objective_thresholds - ) - with torch.no_grad(): - Y = _check_posterior_type(model.posterior(X_observed)).mean - # For EHVI acquisition functions we pass the constraint transform directly. - if outcome_constraints is None: - cons_tfs = None - else: - cons_tfs = get_outcome_constraint_transforms( - outcome_constraints=outcome_constraints - ) - num_objectives = objective_thresholds.shape[0] - # NOTE: Not using checked_cast here because for Python 3.9, isinstance fails with - # `TypeError: Subscripted generics cannot be used with class and instance checks`. - return cast( - Union[qExpectedHypervolumeImprovement, qLogExpectedHypervolumeImprovement], - get_acquisition_function( - acquisition_function_name=acqf_name, - model=model, - objective=objective, - X_observed=X_observed, - X_pending=X_pending, - constraints=cons_tfs, - mc_samples=mc_samples, - alpha=( - get_default_partitioning_alpha(num_objectives=num_objectives) - if alpha is None - else alpha - ), - seed=( - seed - if seed is not None - else cast(int, torch.randint(1, 10000, (1,)).item()) - ), - ref_point=objective_thresholds.tolist(), - Y=Y, - ), - ) - - -# TODO (jej): rewrite optimize_acqf wrappers to avoid duplicate code. -def scipy_optimizer_list( - acq_function_list: list[AcquisitionFunction], - bounds: Tensor, - inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, - fixed_features: dict[int, float] | None = None, - rounding_func: Callable[[Tensor], Tensor] | None = None, - num_restarts: int = 20, - raw_samples: int | None = None, - options: dict[str, bool | float | int | str] | None = None, -) -> tuple[Tensor, Tensor]: - r"""Sequential optimizer using scipy's minimize module on a numpy-adapter. - - The ith acquisition in the sequence uses the ith given acquisition_function. - - Args: - acq_function_list: A list of botorch AcquisitionFunctions, - optimized sequentially. - bounds: A `2 x d`-dim tensor, where `bounds[0]` (`bounds[1]`) are the - lower (upper) bounds of the feasible hyperrectangle. - n: The number of candidates to generate. - inequality constraints: A list of tuples (indices, coefficients, rhs), - with each tuple encoding an inequality constraint of the form - `\sum_i (X[indices[i]] * coefficients[i]) >= rhs` - fixed_features: A map {feature_index: value} for features that should - be fixed to a particular value during generation. - rounding_func: A function that rounds an optimization result - appropriately (i.e., according to `round-trip` transformations). 
- - Returns: - 2-element tuple containing - - - A `n x d`-dim tensor of generated candidates. - - A `n`-dim tensor of conditional acquisition - values, where `i`-th element is the expected acquisition value - conditional on having observed candidates `0,1,...,i-1`. - """ - # Use SLSQP by default for small problems since it yields faster wall times. - optimize_options: dict[str, bool | float | int | str] = { - "batch_limit": BATCH_LIMIT, - "init_batch_limit": INIT_BATCH_LIMIT, - "method": "SLSQP", - } - if options is not None: - optimize_options.update(options) - X, expected_acquisition_value = optimize_acqf_list( - acq_function_list=acq_function_list, - bounds=bounds, - num_restarts=num_restarts, - raw_samples=50 * num_restarts if raw_samples is None else raw_samples, - options=optimize_options, - inequality_constraints=inequality_constraints, - fixed_features=fixed_features, - post_processing_func=rounding_func, - ) - return X, expected_acquisition_value - - -def pareto_frontier_evaluator( - model: TorchGenerator | None, - objective_weights: Tensor, - objective_thresholds: Tensor | None = None, - X: Tensor | None = None, - Y: Tensor | None = None, - Yvar: Tensor | None = None, - outcome_constraints: tuple[Tensor, Tensor] | None = None, -) -> tuple[Tensor, Tensor, Tensor]: - """Return outcomes predicted to lie on a pareto frontier. - - Given a model and points to evaluate, use the model to predict which points - lie on the Pareto frontier. - - Args: - model: Model used to predict outcomes. - objective_weights: A `m` tensor of values indicating the weight to put - on different outcomes. For pareto frontiers only the sign matters. - objective_thresholds: A tensor containing thresholds forming a reference point - from which to calculate pareto frontier hypervolume. Points that do not - dominate the objective_thresholds contribute nothing to hypervolume. - X: A `n x d` tensor of features to evaluate. - Y: A `n x m` tensor of outcomes to use instead of predictions. - Yvar: A `n x m x m` tensor of input covariances (NaN if unobserved). - outcome_constraints: A tuple of (A, b). For k outcome constraints - and m outputs at f(x), A is (k x m) and b is (k x 1) such that - A f(x) <= b. - - Returns: - 3-element tuple containing - - - A `j x m` tensor of outcome on the pareto frontier. j is the number - of frontier points. - - A `j x m x m` tensor of predictive covariances. - cov[j, m1, m2] is Cov[m1@j, m2@j]. - - A `j` tensor of the index of each frontier point in the input Y. - """ - # TODO: better input validation, making more explicit whether we are using - # model predictions or not - if X is not None: - Y, Yvar = none_throws(model).predict(X) - # model.predict returns cpu tensors - Y = Y.to(X.device) - Yvar = Yvar.to(X.device) - elif Y is None or Yvar is None: - raise ValueError( - "Requires `X` to predict or both `Y` and `Yvar` to select a subset of " - "points on the pareto frontier." - ) - - # Apply objective_weights to outcomes and objective_thresholds. - # If objective_thresholds is not None use a dummy tensor of zeros. 
- ( - obj, - weighted_objective_thresholds, - ) = get_weighted_mc_objective_and_objective_thresholds( - objective_weights=objective_weights, - objective_thresholds=( - objective_thresholds - if objective_thresholds is not None - else torch.zeros( - objective_weights.shape, - dtype=objective_weights.dtype, - device=objective_weights.device, - ) - ), - ) - Y_obj = obj(Y) - indx_frontier = torch.arange(Y.shape[0], dtype=torch.long, device=Y.device) - - # Filter Y, Yvar, Y_obj to items that dominate all objective thresholds - if objective_thresholds is not None: - objective_thresholds_mask = torch.all( - Y_obj >= weighted_objective_thresholds, dim=1 - ) - Y = Y[objective_thresholds_mask] - Yvar = Yvar[objective_thresholds_mask] - Y_obj = Y_obj[objective_thresholds_mask] - indx_frontier = indx_frontier[objective_thresholds_mask] - - # Get feasible points that do not violate outcome_constraints - if outcome_constraints is not None: - cons_tfs = get_outcome_constraint_transforms(outcome_constraints) - # Handle NaNs in Y, if those elements are not part of the constraints. - # By setting the unused elements to 0, we prevent them from marking - # the whole constraint value as NaN and evaluating to infeasible. - Y_cons = Y.clone() - Y_cons[..., (outcome_constraints[0] == 0).all(dim=0)] = 0 - # pyre-ignore [16] - feas = torch.stack([c(Y_cons) <= 0 for c in cons_tfs], dim=-1).all(dim=-1) - Y = Y[feas] - Yvar = Yvar[feas] - Y_obj = Y_obj[feas] - indx_frontier = indx_frontier[feas] - - if Y.shape[0] == 0: - # if there are no feasible points that are better than the reference point - # return empty tensors - return Y.cpu(), Yvar.cpu(), indx_frontier.cpu() - - # calculate pareto front with only objective outcomes: - frontier_mask = is_non_dominated(Y_obj) - - # Apply masks - Y_frontier = Y[frontier_mask] - Yvar_frontier = Yvar[frontier_mask] - indx_frontier = indx_frontier[frontier_mask] - return Y_frontier.cpu(), Yvar_frontier.cpu(), indx_frontier.cpu() - - -def infer_objective_thresholds( - model: Model, - objective_weights: Tensor, # objective_directions - X_observed: Tensor, - outcome_constraints: tuple[Tensor, Tensor] | None = None, - subset_idcs: Tensor | None = None, - objective_thresholds: Tensor | None = None, -) -> Tensor: - """Infer objective thresholds. - - This method uses the model-estimated Pareto frontier over the in-sample points - to infer absolute (not relativized) objective thresholds. - - This uses a heuristic that sets the objective threshold to be a scaled nadir - point, where the nadir point is scaled back based on the range of each - objective across the current in-sample Pareto frontier. - - See `botorch.utils.multi_objective.hypervolume.infer_reference_point` for - details on the heuristic. - - Args: - model: A fitted botorch Model. - objective_weights: The objective is to maximize a weighted sum of - the columns of f(x). These are the weights. These should not - be subsetted. - X_observed: A `n x d`-dim tensor of in-sample points to use for - determining the current in-sample Pareto frontier. - outcome_constraints: A tuple of (A, b). For k outcome constraints - and m outputs at f(x), A is (k x m) and b is (k x 1) such that - A f(x) <= b. These should not be subsetted. - subset_idcs: The indices of the outcomes that are modeled by the - provided model. If subset_idcs not None, this method infers - whether the model is subsetted. - objective_thresholds: Any known objective thresholds to pass to - `infer_reference_point` heuristic. This should not be subsetted. 
- If only a subset of the objectives have known thresholds, the - remaining objectives should be NaN. If no objective threshold - was provided, this can be `None`. - - Returns: - A `m`-dim tensor of objective thresholds, where the objective - threshold is `nan` if the outcome is not an objective. - """ - num_outcomes = objective_weights.shape[0] - if subset_idcs is None: - # Subset the model so that we only compute the posterior - # over the relevant outcomes. - # This is a no-op if the model is already only modeling - # the relevant outcomes. - subset_model_results = subset_model( - model=model, - objective_weights=objective_weights, - outcome_constraints=outcome_constraints, - ) - model = subset_model_results.model - objective_weights = subset_model_results.objective_weights - outcome_constraints = subset_model_results.outcome_constraints - subset_idcs = subset_model_results.indices - else: - objective_weights = objective_weights[subset_idcs] - if outcome_constraints is not None: - outcome_constraints = ( - outcome_constraints[0][:, subset_idcs], - outcome_constraints[1], - ) - with torch.no_grad(): - pred = _check_posterior_type( - none_throws(model).posterior(none_throws(X_observed)) - ).mean - - if outcome_constraints is not None: - cons_tfs = get_outcome_constraint_transforms(outcome_constraints) - # pyre-ignore [16] - feas = torch.stack([c(pred) <= 0 for c in cons_tfs], dim=-1).all(dim=-1) - pred = pred[feas] - if pred.shape[0] == 0: - raise AxError(NO_FEASIBLE_POINTS_MESSAGE) - obj_mask = objective_weights.nonzero().view(-1) - obj_weights_subset = objective_weights[obj_mask] - obj = pred[..., obj_mask] * obj_weights_subset - pareto_obj = obj[is_non_dominated(obj)] - # If objective thresholds are provided, set max_ref_point accordingly. - if objective_thresholds is not None: - max_ref_point = objective_thresholds[obj_mask] * obj_weights_subset - else: - max_ref_point = None - objective_thresholds = infer_reference_point( - pareto_Y=pareto_obj, - max_ref_point=max_ref_point, - scale=0.1, - ) - # multiply by objective weights to return objective thresholds in the - # unweighted space - objective_thresholds = objective_thresholds * obj_weights_subset - full_objective_thresholds = torch.full( - (num_outcomes,), - float("nan"), - dtype=objective_weights.dtype, - device=objective_weights.device, - ) - obj_idcs = subset_idcs[obj_mask] - full_objective_thresholds[obj_idcs] = objective_thresholds.clone() - return full_objective_thresholds - - -def _check_posterior_type( - posterior: Posterior, -) -> GPyTorchPosterior | PosteriorList: - """Check whether the posterior type is `GPyTorchPosterior` or `PosteriorList`.""" - if isinstance(posterior, GPyTorchPosterior) or isinstance(posterior, PosteriorList): - return posterior - else: - raise ValueError( - f"Value was not of type GPyTorchPosterior or PosteriorList:\n{posterior}" - ) diff --git a/ax/generators/torch/botorch_moo_utils.py b/ax/generators/torch/botorch_moo_utils.py new file mode 100644 index 00000000000..86a102e3602 --- /dev/null +++ b/ax/generators/torch/botorch_moo_utils.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +""" +References + +.. [Daulton2020qehvi] + S. Daulton, M. Balandat, and E. Bakshy. Differentiable Expected Hypervolume + Improvement for Parallel Multi-Objective Bayesian Optimization. 
Advances in Neural + Information Processing Systems 33, 2020. + +.. [Daulton2021nehvi] + S. Daulton, M. Balandat, and E. Bakshy. Parallel Bayesian Optimization of + Multiple Noisy Objectives with Expected Hypervolume Improvement. Advances + in Neural Information Processing Systems 34, 2021. + +.. [Ament2023logei] + S. Ament, S. Daulton, D. Eriksson, M. Balandat, and E. Bakshy. + Unexpected Improvements to Expected Improvement for Bayesian Optimization. Advances + in Neural Information Processing Systems 36, 2023. +""" + +from __future__ import annotations + +import torch +from ax.exceptions.core import AxError +from ax.generators.torch.utils import subset_model +from ax.generators.torch_base import TorchGenerator +from botorch.acquisition.multi_objective.objective import WeightedMCMultiOutputObjective +from botorch.models.model import Model +from botorch.posteriors.gpytorch import GPyTorchPosterior +from botorch.posteriors.posterior import Posterior +from botorch.posteriors.posterior_list import PosteriorList +from botorch.utils.constraints import get_outcome_constraint_transforms +from botorch.utils.multi_objective.hypervolume import infer_reference_point +from botorch.utils.multi_objective.pareto import is_non_dominated +from pyre_extensions import none_throws +from torch import Tensor + + +NO_FEASIBLE_POINTS_MESSAGE = ( + " Cannot infer objective thresholds due to no observed feasible points. " + " This likely means that one or more outcome constraints is set too strictly. " + " Consider adding thresholds to your objectives to bypass this error." +) + + +def get_weighted_mc_objective_and_objective_thresholds( + objective_weights: Tensor, objective_thresholds: Tensor +) -> tuple[WeightedMCMultiOutputObjective, Tensor]: + r"""Construct weighted objective and apply the weights to objective thresholds. + + Args: + objective_weights: The objective is to maximize a weighted sum of + the columns of f(x). These are the weights. + objective_thresholds: A tensor containing thresholds forming a reference point + from which to calculate pareto frontier hypervolume. Points that do not + dominate the objective_thresholds contribute nothing to hypervolume. + + Returns: + A two-element tuple with the objective and objective thresholds: + + - The objective + - The objective thresholds + + """ + nonzero_idcs = objective_weights.nonzero(as_tuple=False).view(-1) + objective_weights = objective_weights[nonzero_idcs] + objective_thresholds = objective_thresholds[nonzero_idcs] + objective = WeightedMCMultiOutputObjective( + weights=objective_weights, outcomes=nonzero_idcs.tolist() + ) + objective_thresholds = torch.mul(objective_thresholds, objective_weights) + return objective, objective_thresholds + + +def pareto_frontier_evaluator( + model: TorchGenerator | None, + objective_weights: Tensor, + objective_thresholds: Tensor | None = None, + X: Tensor | None = None, + Y: Tensor | None = None, + Yvar: Tensor | None = None, + outcome_constraints: tuple[Tensor, Tensor] | None = None, +) -> tuple[Tensor, Tensor, Tensor]: + """Return outcomes predicted to lie on a pareto frontier. + + Given a model and points to evaluate, use the model to predict which points + lie on the Pareto frontier. + + Args: + model: Model used to predict outcomes. + objective_weights: A `m` tensor of values indicating the weight to put + on different outcomes. For pareto frontiers only the sign matters. + objective_thresholds: A tensor containing thresholds forming a reference point + from which to calculate pareto frontier hypervolume. 
Points that do not + dominate the objective_thresholds contribute nothing to hypervolume. + X: A `n x d` tensor of features to evaluate. + Y: A `n x m` tensor of outcomes to use instead of predictions. + Yvar: A `n x m x m` tensor of input covariances (NaN if unobserved). + outcome_constraints: A tuple of (A, b). For k outcome constraints + and m outputs at f(x), A is (k x m) and b is (k x 1) such that + A f(x) <= b. + + Returns: + 3-element tuple containing + + - A `j x m` tensor of outcomes on the pareto frontier. j is the number + of frontier points. + - A `j x m x m` tensor of predictive covariances. + cov[j, m1, m2] is Cov[m1@j, m2@j]. + - A `j` tensor of the index of each frontier point in the input Y. + """ + # TODO: better input validation, making more explicit whether we are using + # model predictions or not + if X is not None: + Y, Yvar = none_throws(model).predict(X) + # model.predict returns cpu tensors + Y = Y.to(X.device) + Yvar = Yvar.to(X.device) + elif Y is None or Yvar is None: + raise ValueError( + "Requires `X` to predict or both `Y` and `Yvar` to select a subset of " + "points on the pareto frontier." + ) + + # Apply objective_weights to outcomes and objective_thresholds. + # If objective_thresholds is None, use a dummy tensor of zeros. + ( + obj, + weighted_objective_thresholds, + ) = get_weighted_mc_objective_and_objective_thresholds( + objective_weights=objective_weights, + objective_thresholds=( + objective_thresholds + if objective_thresholds is not None + else torch.zeros( + objective_weights.shape, + dtype=objective_weights.dtype, + device=objective_weights.device, + ) + ), + ) + Y_obj = obj(Y) + indx_frontier = torch.arange(Y.shape[0], dtype=torch.long, device=Y.device) + + # Filter Y, Yvar, Y_obj to items that dominate all objective thresholds + if objective_thresholds is not None: + objective_thresholds_mask = torch.all( + Y_obj >= weighted_objective_thresholds, dim=1 + ) + Y = Y[objective_thresholds_mask] + Yvar = Yvar[objective_thresholds_mask] + Y_obj = Y_obj[objective_thresholds_mask] + indx_frontier = indx_frontier[objective_thresholds_mask] + + # Get feasible points that do not violate outcome_constraints + if outcome_constraints is not None: + cons_tfs = get_outcome_constraint_transforms(outcome_constraints) + # Handle NaNs in Y, if those elements are not part of the constraints. + # By setting the unused elements to 0, we prevent them from marking + # the whole constraint value as NaN and evaluating to infeasible.
+ Y_cons = Y.clone() + Y_cons[..., (outcome_constraints[0] == 0).all(dim=0)] = 0 + # pyre-ignore [16] + feas = torch.stack([c(Y_cons) <= 0 for c in cons_tfs], dim=-1).all(dim=-1) + Y = Y[feas] + Yvar = Yvar[feas] + Y_obj = Y_obj[feas] + indx_frontier = indx_frontier[feas] + + if Y.shape[0] == 0: + # if there are no feasible points that are better than the reference point + # return empty tensors + return Y.cpu(), Yvar.cpu(), indx_frontier.cpu() + + # calculate pareto front with only objective outcomes: + frontier_mask = is_non_dominated(Y_obj) + + # Apply masks + Y_frontier = Y[frontier_mask] + Yvar_frontier = Yvar[frontier_mask] + indx_frontier = indx_frontier[frontier_mask] + return Y_frontier.cpu(), Yvar_frontier.cpu(), indx_frontier.cpu() + + + def infer_objective_thresholds( + model: Model, + objective_weights: Tensor, # objective_directions + X_observed: Tensor, + outcome_constraints: tuple[Tensor, Tensor] | None = None, + subset_idcs: Tensor | None = None, + objective_thresholds: Tensor | None = None, + ) -> Tensor: + """Infer objective thresholds. + + This method uses the model-estimated Pareto frontier over the in-sample points + to infer absolute (not relativized) objective thresholds. + + This uses a heuristic that sets the objective threshold to be a scaled nadir + point, where the nadir point is scaled back based on the range of each + objective across the current in-sample Pareto frontier. + + See `botorch.utils.multi_objective.hypervolume.infer_reference_point` for + details on the heuristic. + + Args: + model: A fitted botorch Model. + objective_weights: The objective is to maximize a weighted sum of + the columns of f(x). These are the weights. These should not + be subsetted. + X_observed: A `n x d`-dim tensor of in-sample points to use for + determining the current in-sample Pareto frontier. + outcome_constraints: A tuple of (A, b). For k outcome constraints + and m outputs at f(x), A is (k x m) and b is (k x 1) such that + A f(x) <= b. These should not be subsetted. + subset_idcs: The indices of the outcomes that are modeled by the + provided model. If subset_idcs is not None, the model is assumed + to already be subsetted to these outcomes. + objective_thresholds: Any known objective thresholds to pass to + `infer_reference_point` heuristic. This should not be subsetted. + If only a subset of the objectives have known thresholds, the + remaining objectives should be NaN. If no objective threshold + was provided, this can be `None`. + + Returns: + A `m`-dim tensor of objective thresholds, where the objective + threshold is `nan` if the outcome is not an objective. + """ + num_outcomes = objective_weights.shape[0] + if subset_idcs is None: + # Subset the model so that we only compute the posterior + # over the relevant outcomes. + # This is a no-op if the model is already only modeling + # the relevant outcomes.
+ subset_model_results = subset_model( + model=model, + objective_weights=objective_weights, + outcome_constraints=outcome_constraints, + ) + model = subset_model_results.model + objective_weights = subset_model_results.objective_weights + outcome_constraints = subset_model_results.outcome_constraints + subset_idcs = subset_model_results.indices + else: + objective_weights = objective_weights[subset_idcs] + if outcome_constraints is not None: + outcome_constraints = ( + outcome_constraints[0][:, subset_idcs], + outcome_constraints[1], + ) + with torch.no_grad(): + pred = _check_posterior_type( + none_throws(model).posterior(none_throws(X_observed)) + ).mean + + if outcome_constraints is not None: + cons_tfs = get_outcome_constraint_transforms(outcome_constraints) + # pyre-ignore [16] + feas = torch.stack([c(pred) <= 0 for c in cons_tfs], dim=-1).all(dim=-1) + pred = pred[feas] + if pred.shape[0] == 0: + raise AxError(NO_FEASIBLE_POINTS_MESSAGE) + obj_mask = objective_weights.nonzero().view(-1) + obj_weights_subset = objective_weights[obj_mask] + obj = pred[..., obj_mask] * obj_weights_subset + pareto_obj = obj[is_non_dominated(obj)] + # If objective thresholds are provided, set max_ref_point accordingly. + if objective_thresholds is not None: + max_ref_point = objective_thresholds[obj_mask] * obj_weights_subset + else: + max_ref_point = None + objective_thresholds = infer_reference_point( + pareto_Y=pareto_obj, + max_ref_point=max_ref_point, + scale=0.1, + ) + # multiply by objective weights to return objective thresholds in the + # unweighted space + objective_thresholds = objective_thresholds * obj_weights_subset + full_objective_thresholds = torch.full( + (num_outcomes,), + float("nan"), + dtype=objective_weights.dtype, + device=objective_weights.device, + ) + obj_idcs = subset_idcs[obj_mask] + full_objective_thresholds[obj_idcs] = objective_thresholds.clone() + return full_objective_thresholds + + +def _check_posterior_type( + posterior: Posterior, +) -> GPyTorchPosterior | PosteriorList: + """Check whether the posterior type is `GPyTorchPosterior` or `PosteriorList`.""" + if isinstance(posterior, GPyTorchPosterior) or isinstance(posterior, PosteriorList): + return posterior + else: + raise ValueError( + f"Value was not of type GPyTorchPosterior or PosteriorList:\n{posterior}" + ) diff --git a/ax/generators/torch/tests/test_acquisition.py b/ax/generators/torch/tests/test_acquisition.py index 8fa5a44178a..733443d2766 100644 --- a/ax/generators/torch/tests/test_acquisition.py +++ b/ax/generators/torch/tests/test_acquisition.py @@ -916,7 +916,7 @@ def test_evaluate(self, mock_call: Mock, mock_evaluate: Mock) -> None: @mock_botorch_optimize @mock.patch( # pyre-ignore - "ax.generators.torch.botorch_moo_defaults._check_posterior_type", + "ax.generators.torch.botorch_moo_utils._check_posterior_type", wraps=lambda y: y, ) @mock.patch(f"{ACQUISITION_PATH}._get_X_pending_and_observed") diff --git a/ax/generators/torch/utils.py b/ax/generators/torch/utils.py index b8333ea07af..5103c356227 100644 --- a/ax/generators/torch/utils.py +++ b/ax/generators/torch/utils.py @@ -10,17 +10,13 @@ from dataclasses import dataclass from typing import Any, cast -import numpy as np import numpy.typing as npt import torch -from ax.core.search_space import SearchSpaceDigest from ax.exceptions.core import UnsupportedError from ax.generators.model_utils import ( filter_constraints_and_fixed_features, get_observed, ) -from ax.generators.random.sobol import SobolGenerator -from ax.generators.types import TConfig from 
ax.utils.common.constants import Keys from botorch.acquisition.acquisition import AcquisitionFunction from botorch.acquisition.analytic import PosteriorMean @@ -60,7 +56,7 @@ from botorch.utils.constraints import get_outcome_constraint_transforms from botorch.utils.datasets import SupervisedDataset from botorch.utils.objective import get_objective_weights_transform -from botorch.utils.sampling import sample_hypersphere, sample_simplex +from botorch.utils.sampling import sample_simplex from botorch.utils.transforms import is_ensemble from torch import Tensor from torch.nn import ModuleList # @manual @@ -202,41 +198,6 @@ def _get_X_pending_and_observed( return X_pending, unfiltered_X_observed -def _generate_sobol_points( - n_sobol: int, - search_space_digest: SearchSpaceDigest, - device: torch.device, - linear_constraints: tuple[Tensor, Tensor] | None = None, - fixed_features: dict[int, float] | None = None, - rounding_func: Callable[[Tensor], Tensor] | None = None, - model_gen_options: TConfig | None = None, -) -> Tensor: - linear_constraints_array = None - - if linear_constraints is not None: - linear_constraints_array = ( - linear_constraints[0].detach().cpu().numpy(), - linear_constraints[1].detach().cpu().numpy(), - ) - - array_rounding_func = None - if rounding_func is not None: - array_rounding_func = tensor_callable_to_array_callable( - tensor_func=rounding_func, device=device - ) - - sobol = SobolGenerator(deduplicate=False, seed=np.random.randint(10000)) - array_X, _ = sobol.gen( - n=n_sobol, - search_space_digest=search_space_digest, - linear_constraints=linear_constraints_array, - fixed_features=fixed_features, - rounding_func=array_rounding_func, - model_gen_options=model_gen_options, - ) - return torch.from_numpy(array_X).to(device) - - def subset_model( model: Model, objective_weights: Tensor, @@ -560,39 +521,6 @@ def predict_from_model( return mean, cov -# TODO(jej): Possibly refactor to use "objective_directions". -def randomize_objective_weights( - objective_weights: Tensor, - random_scalarization_distribution: str = SIMPLEX, -) -> Tensor: - """Generate a random weighting based on acquisition function settings. - - Args: - objective_weights: Base weights to multiply by random values. - random_scalarization_distribution: "simplex" or "hypersphere". - - Returns: - A normalized list of indices such that each index is between `0` and `d-1`. - """ - # Set distribution and sample weights. - distribution = random_scalarization_distribution - dtype = objective_weights.dtype - device = objective_weights.device - if distribution == SIMPLEX: - random_weights = sample_simplex( - len(objective_weights), dtype=dtype, device=device - ).squeeze() - elif distribution == HYPERSPHERE: - random_weights = torch.abs( - sample_hypersphere( - len(objective_weights), dtype=dtype, device=device - ).squeeze() - ) - # pyre-fixme[61]: `random_weights` may not be initialized here. - objective_weights = torch.mul(objective_weights, random_weights) - return objective_weights - - def _datasets_to_legacy_inputs( datasets: Sequence[SupervisedDataset], ) -> tuple[list[Tensor], list[Tensor], list[Tensor]]: