src/pyhf/pdf.py (39 changes: 35 additions & 4 deletions)
@@ -2,7 +2,7 @@

import copy
import logging
from typing import List
from typing import List, Union

import pyhf.parameters
import pyhf
@@ -506,7 +506,15 @@ def logpdf(self, auxdata, pars):
class _MainModel:
"""Factory class to create pdfs for the main measurement."""

def __init__(self, config, modifiers, nominal_rates, batch_size=None):
def __init__(
self,
config,
modifiers,
nominal_rates,
batch_size=None,
clip_sample_data: Union[float, None] = None,
clip_bin_data: Union[float, None] = None,
):
default_backend = pyhf.default_backend

self.config = config
@@ -515,6 +523,17 @@ def __init__(self, config, modifiers, nominal_rates, batch_size=None):
self._delta_mods = []
self.batch_size = batch_size

self.clip_sample_data = clip_sample_data
self.clip_bin_data = clip_bin_data

if self.clip_sample_data is not None:
log.warning(
f"Clipping expected data per-bin for each sample below {self.clip_sample_data}"
)

if self.clip_bin_data is not None:
log.warning(f"Clipping expected data per-bin below {self.clip_bin_data}")

self._nominal_rates = default_backend.tile(
nominal_rates, (1, 1, self.batch_size or 1, 1)
)
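A minimal sketch of what these warnings look like from the user side, assuming the two-channel workspace `spec` defined in the new test further down; the public keyword arguments themselves are wired up in the `Model.__init__` hunk below.

```python
# Sketch only: enabling clipping emits the warnings added above at construction time.
# `spec` is assumed to be the two-channel workspace from the new test in tests/test_pdf.py.
import logging
import pyhf

logging.basicConfig(level=logging.WARNING)

ws = pyhf.Workspace(spec)
model = ws.model(clip_sample_data=0.0, clip_bin_data=0.0)
# Expected log output (from the pyhf.pdf logger):
#   Clipping expected data per-bin for each sample below 0.0
#   Clipping expected data per-bin below 0.0
```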
@@ -619,13 +638,21 @@ def expected_data(self, pars, return_by_sample=False):
allfac = tensorlib.concatenate(factors + [nom_plus_delta])

newbysample = tensorlib.product(allfac, axis=0)
if self.clip_sample_data is not None:
newbysample = tensorlib.clip(
newbysample, self.clip_sample_data, max_value=None
)

if return_by_sample:
batch_first = tensorlib.einsum('ij...->ji...', newbysample)
if self.batch_size is None:
return batch_first[0]
return batch_first

newresults = tensorlib.sum(newbysample, axis=0)
if self.clip_bin_data is not None:
newresults = tensorlib.clip(newresults, self.clip_bin_data, max_value=None)

if self.batch_size is None:
return newresults[0]
return newresults
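To make the order of operations explicit, here is a NumPy stand-in for the tensorlib calls above with illustrative numbers: the sample-level clip acts on each sample's expected bin contents before the sum over samples, while the bin-level clip acts on the summed result.

```python
# NumPy sketch of the two clipping points in expected_data (values are illustrative).
import numpy as np

newbysample = np.array([[120.0, -5.0], [-30.0, 2.0]])  # (sample, bin)

clipped_by_sample = np.clip(newbysample, 0.0, None)     # clip_sample_data=0.0
# -> [[120., 0.], [0., 2.]]
newresults = clipped_by_sample.sum(axis=0)              # sum over samples
# -> [120., 2.]

# With only clip_bin_data=0.0, the clip is applied after the sum instead:
np.clip(newbysample.sum(axis=0), 0.0, None)             # -> [90., 0.]
```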
@@ -640,6 +667,8 @@ def __init__(
modifier_set=None,
batch_size=None,
validate: bool = True,
clip_sample_data: Union[float, None] = None,
clip_bin_data: Union[float, None] = None,
**config_kwargs,
):
"""
@@ -650,6 +679,8 @@ def __init__(
batch_size (:obj:`None` or :obj:`int`): Number of simultaneous (batched)
Models to compute.
validate (:obj:`bool`): Whether to validate against a JSON schema
clip_sample_data (:obj:`None` or :obj:`float`): Clip the minimum value of expected data by-sample. Default is no clipping.
clip_bin_data (:obj:`None` or :obj:`float`): Clip the minimum value of expected data by-bin. Default is no clipping.
config_kwargs: Possible keyword arguments for the model configuration

Returns:
@@ -684,6 +715,8 @@ def __init__(
modifiers=modifiers,
nominal_rates=_nominal_rates,
batch_size=self.batch_size,
clip_sample_data=clip_sample_data,
clip_bin_data=clip_bin_data,
)

# the below call needs auxdata order for example
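The new keyword arguments are only forwarded here; the user-facing switch is the model constructor. A sketch reusing `ws` and `pars` from the new test below (the quoted yields are the values the test asserts):

```python
# Sketch, reusing `ws` and `pars` from the new test in tests/test_pdf.py.
model_default = ws.model()                      # no clipping
model_clipped = ws.model(clip_sample_data=0.0)  # clip each sample's expected bins at 0.0

# Per the new test, the unclipped model can return negative expected yields:
#   model_default.expected_actualdata(pars)  ~ [183.05, -0.007, -635.60]
#   model_clipped.expected_actualdata(pars)  ~ [183.05,  0.0,      0.0]
```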
@@ -794,8 +827,6 @@ def make_pdf(self, pars):
pdf: A distribution object implementing the main measurement pdf of HistFactory

"""
tensorlib, _ = get_backend()

pdfobjs = []
mainpdf = self.main_model.make_pdf(pars)
if mainpdf:
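The two removed lines were dead code: `tensorlib` was never used in this method. For context, a sketch of what `make_pdf` hands back, assuming any constructed `model` such as the one above:

```python
# Sketch: make_pdf returns the joint HistFactory distribution that backs model.logpdf.
init_pars = pyhf.tensorlib.astensor(model.config.suggested_init())
data = model.expected_data(init_pars)  # actual data concatenated with auxdata
pdf = model.make_pdf(init_pars)
pdf.log_prob(data)  # same log-likelihood value as model.logpdf(init_pars, data)
```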
tests/test_pdf.py (112 changes: 112 additions & 0 deletions)
@@ -1070,3 +1070,115 @@ def make_model(
pyhf.tensorlib.astensor(1.05),
pyhf.tensorlib.astensor([5.0, 5.0]),
)


def test_pdf_clipping(backend):
tensorlib, optimizer = pyhf.get_backend()

spec = {
"channels": [
{
"name": "ch1",
"samples": [
{
"data": [100.0, 100.0],
"modifiers": [
{"data": None, "name": "mu_sig", "type": "normfactor"},
{
"data": {
"hi_data": [125.0, 75.0],
"lo_data": [175.0, 10.0],
},
"name": "np_1",
"type": "histosys",
},
{
"data": {
"hi_data": [125.0, 75.0],
"lo_data": [175.0, 10.0],
},
"name": "np_2",
"type": "histosys",
},
],
"name": "signal",
}
],
},
{
"name": "ch2",
"samples": [
{
"data": [15000.0],
"modifiers": [
{"data": None, "name": "mu_sig", "type": "normfactor"},
{
"data": {"hi_data": [15500.0], "lo_data": [1200.0]},
"name": "np_1",
"type": "histosys",
},
{
"data": {"hi_data": [15500.0], "lo_data": [1200.0]},
"name": "np_2",
"type": "histosys",
},
],
"name": "signal",
}
],
},
],
"measurements": [
{"config": {"parameters": [], "poi": "mu_sig"}, "name": "meas"}
],
"observations": [
{"data": [100], "name": "ch1"},
{"data": [1000], "name": "ch2"},
],
"version": "1.0.0",
}

par_values = []

ws = pyhf.Workspace(spec)
model = ws.model()
data = tensorlib.astensor([100.0, 100.0, 10.0, 0.0, 0.0])

for par_name in model.config.par_names():
if "np" in par_name:
par_values.append(-0.6) # np_1 / np_2
else:
par_values.append(1.0) # mu

pars = tensorlib.astensor(par_values)

# Check with no clipping
assert any(value < 0 for value in model.expected_actualdata(pars))
assert tensorlib.tolist(model.expected_actualdata(pars)) == pytest.approx(
[1.830496e02, -7.040000e-03, -6.355968e02], abs=1e-3
)

# Check with clipping by-sample
model_clip_sample = ws.model(clip_sample_data=0.0)
assert all(value >= 0 for value in model_clip_sample.expected_actualdata(pars))
assert tensorlib.tolist(
model_clip_sample.expected_actualdata(pars)
) == pytest.approx([1.830496e02, 0.0, 0.0], abs=1e-3)

# Check with clipping by-bin
model_clip_bin = ws.model(clip_bin_data=0.0)
assert all(value >= 0 for value in model_clip_bin.expected_actualdata(pars))
assert tensorlib.tolist(model_clip_bin.expected_actualdata(pars)) == pytest.approx(
[1.830496e02, 0.0, 0.0], abs=1e-3
)

    # Minuit cannot handle negative yields; confirm that MLE fails specifically for Minuit
if optimizer.name == 'minuit':
with pytest.raises(pyhf.exceptions.FailedMinimization):
pyhf.infer.mle.fit(data, model)
else:
pyhf.infer.mle.fit(data, model)

# We should be able to converge when clipping is enabled
pyhf.infer.mle.fit(data, model_clip_sample)
pyhf.infer.mle.fit(data, model_clip_bin)
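A usage-level sketch of the behaviour this test pins down, assuming the NumPy backend paired with the Minuit optimizer and reusing `ws`, `data`, and `model` from above:

```python
# Sketch: with Minuit, the unclipped model fails on negative expected yields,
# while a clipped model converges (as asserted in the test above).
pyhf.set_backend("numpy", "minuit")

try:
    pyhf.infer.mle.fit(data, model)
except pyhf.exceptions.FailedMinimization:
    bestfit = pyhf.infer.mle.fit(data, ws.model(clip_bin_data=0.0))
```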