Fit PDF with conditional variable

In this example, we show an unusual fit where the total sample is not drawn from a single probability distribution, but each individual sample \(x\) is drawn from a different distribution, whose parameters are determined by a conditional variable \(y\).

In our example, we are drawing samples \(x\) from varying Gaussian distributions. The location of each Gaussian is a function of the conditional variable \(y\), but all share the same width parameter \(\sigma\). We fit the shared parameter \(\sigma\), but also the parameters \(a\) and \(b\) which determine how the location of each Gaussian depends on \(y\), assuming a linear function \(\mu = a + b y\).

This tutorial reproduces a corresponding one from RooFit.

[1]:
import iminuit
from iminuit.cost import UnbinnedNLL
from iminuit import Minuit
import numpy as np
import numba as nb
import boost_histogram as bh
import matplotlib.pyplot as plt
from scipy.stats import norm
from numba_stats import norm as norm_nb
print("iminuit version", iminuit.__version__)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Input In [1], in <cell line: 8>()
      6 import boost_histogram as bh
      7 import matplotlib.pyplot as plt
----> 8 from scipy.stats import norm
      9 from numba_stats import norm as norm_nb
     10 print("iminuit version", iminuit.__version__)

File /usr/lib/python3.10/site-packages/scipy/stats/__init__.py:467, in <module>
      1 """
      2 .. _statsrefmanual:
      3
   (...)
    462
    463 """
    465 from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
    466                                DegenerateDataWarning, FitError)
--> 467 from ._stats_py import *
    468 from ._variation import variation
    469 from .distributions import *

File /usr/lib/python3.10/site-packages/scipy/stats/_stats_py.py:39, in <module>
     36 from numpy.lib import NumpyVersion
     37 from numpy.testing import suppress_warnings
---> 39 from scipy.spatial.distance import cdist
     40 from scipy.ndimage import _measurements
     41 from scipy._lib._util import (check_random_state, MapWrapper,
     42                               rng_integers, _rename_parameter)

File /usr/lib/python3.10/site-packages/scipy/spatial/__init__.py:105, in <module>
      1 """
      2 =============================================================
      3 Spatial algorithms and data structures (:mod:`scipy.spatial`)
   (...)
    102    QhullError
    103 """
--> 105 from ._kdtree import *
    106 from ._ckdtree import *
    107 from ._qhull import *

File /usr/lib/python3.10/site-packages/scipy/spatial/_kdtree.py:5, in <module>
      3 import numpy as np
      4 import warnings
----> 5 from ._ckdtree import cKDTree, cKDTreeNode
      7 __all__ = ['minkowski_distance_p', 'minkowski_distance',
      8            'distance_matrix',
      9            'Rectangle', 'KDTree']
     12 def minkowski_distance_p(x, y, p=2):

File _ckdtree.pyx:10, in init scipy.spatial._ckdtree()

File /usr/lib/python3.10/site-packages/scipy/sparse/__init__.py:267, in <module>
    264 import warnings as _warnings
    266 from ._base import *
--> 267 from ._csr import *
    268 from ._csc import *
    269 from ._lil import *

File /usr/lib/python3.10/site-packages/scipy/sparse/_csr.py:10, in <module>
      7 import numpy as np
      9 from ._base import spmatrix
---> 10 from ._sparsetools import (csr_tocsc, csr_tobsr, csr_count_blocks,
     11                            get_csr_submatrix)
     12 from ._sputils import upcast, get_index_dtype
     14 from ._compressed import _cs_matrix

ImportError: numpy.core.multiarray failed to import
[2]:
# fixed seed so that the generated toy data set is reproducible
rng = np.random.default_rng(1)

# conditional variable: each sample is paired with a random y parameter,
# drawn from a normal distribution with mean 0 and standard deviation 10
y = rng.normal(0, 10, size=10000)
# keep only values with |y| < 10; this discards the tails, so the final
# sample is smaller than the 10000 values drawn above
y = y[np.abs(y) < 10]  # truncate at 10

# location of each gaussian is a function of y
def mu(y, a, b):
    """
    Return the location of the Gaussian for conditional value(s) ``y``.

    The location is the straight line ``a + b * y``, where ``a`` is the
    offset and ``b`` the slope.
    """
    slope_term = b * y
    return a + slope_term

# draw samples from Gaussians whose locations depend on y
# true parameter values; the keys are the parameter names a, b, sigma
truth = {"a": 0, "b": 0.5, "sigma": 1.0}
# one x per y: the mean is computed element-wise from y, the width is shared
x = rng.normal(mu(y, truth["a"], truth["b"]), truth["sigma"])

The distribution in \(x\) is broader than the usual Gaussian because it is a convolution of many Gaussian distributions with varying means. We can visualise this by binning the data in \(x\) and \(y\).

[3]:
# 2D histogram of the data: 100 regular bins in x and 5 in y, both from -10 to 10
ax_x = bh.axis.Regular(100, -10, 10)
ax_y = bh.axis.Regular(5, -10, 10)
h = bh.Histogram(ax_x, ax_y)
h.fill(x, y)
# draw the x distribution separately for each y bin; each item of the y axis
# is unpacked into the two edges (a, b) of the bin, which label the interval
for i, (a, b) in enumerate(ax_y):
    plt.stairs(h.values()[:,i], ax_x.edges, label=f"[{a}, {b})",
               fill=True, alpha=0.2)
# sum over the y axis to obtain the total distribution in x
h1 = h[:, sum]
plt.stairs(h1.values(), ax_x.edges, color="k", label="total")
plt.xlabel("x")
plt.ylabel("events")
# trailing semicolon suppresses the text output of the last call in the notebook
plt.legend(title="y interval", frameon=False, handlelength=1.2);
../_images/notebooks_conditional_variable_4_0.svg

Fit with conditional variable

The random distribution of \(x\) depends on the value of \(y\). We can exploit that information in the likelihood function to obtain a more accurate estimate of the parameters.

[4]:
def model(xy, a, b, sigma):
    """
    Evaluate the conditional Gaussian pdf of x given y.

    Parameters
    ----------
    xy : pair
        The samples ``x`` and the conditional values ``y`` paired with them.
    a, b : float
        Offset and slope of the location ``a + b * y``.
    sigma : float
        Width shared by all Gaussians.

    Returns
    -------
    The normal pdf evaluated at each ``x``, located at ``a + b * y``.
    """
    samples, cond = xy
    loc = a + b * cond
    # numba_stats' norm.pdf cannot be used here, because it is not
    # vectorized in the location parameter
    return norm.pdf(samples, loc=loc, scale=sigma)

# unbinned negative log-likelihood; the data is passed as the pair (x, y),
# which the model receives as its first argument
nll = UnbinnedNLL((x, y), model)

# starting values for a, b, sigma, in the order of the model signature
m = Minuit(nll, 0.0, 0.0, 2.0)
# restrict sigma to positive values: lower limit 0, no upper limit
m.limits["sigma"] = (0, None)
m.migrad()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [4], in <cell line: 11>()
      9 m = Minuit(nll, 0.0, 0.0, 2.0)
     10 m.limits["sigma"] = (0, None)
---> 11 m.migrad()

File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/minuit.py:694, in Minuit.migrad(self, ncall, iterate)
    692 if self._precision is not None:
    693     migrad.precision = self._precision
--> 694 fm = migrad(ncall, self._tolerance)
    695 if fm.is_valid or fm.has_reached_call_limit:
    696     break

File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:484, in Cost.__call__(self, *args)
    469 def __call__(self, *args):
    470     """
    471     Evaluate the cost function.
    472
   (...)
    482     float
    483     """
--> 484     r = self._call(args)
    485     if self.verbose >= 1:
    486         print(args, "->", r)

File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:787, in UnbinnedNLL._call(self, args)
    785 def _call(self, args):
    786     data = self._masked
--> 787     x = self._model(data, *args)
    788     x = _normalize_model_output(x)
    789     if self._log:

Input In [4], in model(xy, a, b, sigma)
      3 mu = a + b * y
      4 # cannot use norm.pdf from numba_stats here, because it is not vectorized in mu
----> 5 return norm.pdf(x, mu, sigma)

NameError: name 'norm' is not defined
[5]:
# construct model representation for comparison with data histogram
a, b, sigma = m.values

# get expected content per bin from cdf, sum over the individual cdfs
v = np.diff(np.sum(norm.cdf(ax_x.edges[:,np.newaxis],
                            mu(y, a, b), sigma), axis=1))

plt.stairs(v, ax_x.edges, label="model", zorder=5, lw=2)
plt.errorbar(ax_x.centers, h1.values(), h1.variances() ** 0.5,
             fmt="ok", label="data")
plt.xlabel("x")
plt.ylabel("events")
plt.legend(frameon=False);
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Input In [5], in <cell line: 5>()
      2 a, b, sigma = m.values
      4 # get expected content per bin from cdf, sum over the individual cdfs
----> 5 v = np.diff(np.sum(norm.cdf(ax_x.edges[:,np.newaxis],
      6                             mu(y, a, b), sigma), axis=1))
      8 plt.stairs(v, ax_x.edges, label="model", zorder=5, lw=2)
      9 plt.errorbar(ax_x.centers, h1.values(), h1.variances() ** 0.5,
     10              fmt="ok", label="data")

NameError: name 'norm' is not defined

Fit without conditional variable

We can also ignore the dependence of \(x\) on \(y\) and just fit the total \(x\) distribution with a model built from the distribution of \(y\) values. This also works in this case, but information is lost and therefore the parameter uncertainties become larger than in the previous case.

On top of that, the calculation is much slower, because building the pdf is more expensive. We parallelise the computation with numba.

[6]:
# select numba's 'workqueue' threading layer for the parallel model below
# NOTE(review): presumably chosen because it needs no external threading
# library (TBB/OpenMP) -- confirm
nb.config.THREADING_LAYER = 'workqueue'


@nb.njit(parallel=True, fastmath=True)
def model(x, a, b, sigma):
    """
    Evaluate the x distribution as a sum of Gaussians over all y values.

    For every entry of the global array ``y`` a Gaussian pdf with location
    ``a + b * y[i]`` and common width ``sigma`` is evaluated at ``x``, and
    the results are added up. The sum is not divided by the number of
    ``y`` values, so the returned density is not normalised to one.

    This definition replaces the earlier ``model(xy, a, b, sigma)``.
    """
    # y is not a parameter; it is taken from the enclosing (global) scope
    mu = a + b * y
    total = np.zeros_like(x)
    # parallel loop over the Gaussian locations, accumulated into total
    for i in nb.prange(len(mu)):
        total += norm_nb.pdf(x, mu[i], sigma)
    return total


# unbinned negative log-likelihood using only x as data; this rebinds nll
nll = UnbinnedNLL(x, model)
# same starting values (a, b, sigma) and sigma limit as in the fit with
# the conditional variable
m2 = Minuit(nll, 0.0, 0.0, 2.0)
m2.limits["sigma"] = (0, None)
m2.migrad()
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf
---------------------------------------------------------------------------
TypingError                               Traceback (most recent call last)
Input In [6], in <cell line: 16>()
     14 m2 = Minuit(nll, 0.0, 0.0, 2.0)
     15 m2.limits["sigma"] = (0, None)
---> 16 m2.migrad()

File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/minuit.py:694, in Minuit.migrad(self, ncall, iterate)
    692 if self._precision is not None:
    693     migrad.precision = self._precision
--> 694 fm = migrad(ncall, self._tolerance)
    695 if fm.is_valid or fm.has_reached_call_limit:
    696     break

File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:484, in Cost.__call__(self, *args)
    469 def __call__(self, *args):
    470     """
    471     Evaluate the cost function.
    472
   (...)
    482     float
    483     """
--> 484     r = self._call(args)
    485     if self.verbose >= 1:
    486         print(args, "->", r)

File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:787, in UnbinnedNLL._call(self, args)
    785 def _call(self, args):
    786     data = self._masked
--> 787     x = self._model(data, *args)
    788     x = _normalize_model_output(x)
    789     if self._log:

File ~/python-iminuit/src/python-iminuit/test-env/lib/python3.10/site-packages/numba/core/dispatcher.py:468, in _DispatcherBase._compile_for_args(self, *args, **kws)
    464         msg = (f"{str(e).rstrip()} \n\nThis error may have been caused "
    465                f"by the following argument(s):\n{args_str}\n")
    466         e.patch_message(msg)
--> 468     error_rewrite(e, 'typing')
    469 except errors.UnsupportedError as e:
    470     # Something unsupported is present in the user code, add help info
    471     error_rewrite(e, 'unsupported_error')

File ~/python-iminuit/src/python-iminuit/test-env/lib/python3.10/site-packages/numba/core/dispatcher.py:409, in _DispatcherBase._compile_for_args.<locals>.error_rewrite(e, issue_type)
    407     raise e
    408 else:
--> 409     raise e.with_traceback(None)

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
NameError: name 'norm_nb' is not defined
[7]:
# one panel per fit parameter, comparing both fit results with the true value
fig, ax = plt.subplots(1, 3, figsize=(8, 2), constrained_layout=True)
for par, axi in zip(m.parameters, ax):
    axi.set_title(par)
    t = truth[par]
    # dashed grey line marks the true value used to generate the data
    axi.axhline(t, ls="--", color="0.5")
    # result of the fit with the conditional variable (black marker)
    axi.errorbar(["with\n conditional"], m.values[par],
                 m.errors[par], fmt="ok")
    # result of the fit without the conditional variable (red marker)
    axi.errorbar(["without\n conditional"], m2.values[par],
                 m2.errors[par], fmt="or")
    axi.set_xlim(-0.5, 1.5)
    # vertical range: true value +/- twice the error of the fit without
    # the conditional variable
    dt = 2 * m2.errors[par]
    axi.set_ylim(t - dt, t + dt)
../_images/notebooks_conditional_variable_10_0.svg