Fit PDF with conditional variable¶
In this example, we show an unusual fit where the total sample is not drawn from a single probability distribution; instead, each individual sample \(x\) is drawn from a different distribution, whose parameters are determined by a conditional variable \(y\).
In our example, we are drawing samples \(x\) from varying Gaussian distributions. The location of each Gaussian is a function of the conditional variable \(y\), but all share the same width parameter \(\sigma\). We fit the shared parameter \(\sigma\), but also the parameters \(a\) and \(b\) which determine how the location of each Gaussian depends on \(y\), assuming a linear function \(\mu = a + b y\).
This tutorial reproduces a corresponding one from RooFit.
[1]:
import iminuit
from iminuit.cost import UnbinnedNLL
from iminuit import Minuit
import numpy as np
import numba as nb
import boost_histogram as bh
import matplotlib.pyplot as plt
from scipy.stats import norm
from numba_stats import norm as norm_nb
# Report which iminuit release is in use.
print(f"iminuit version {iminuit.__version__}")
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Input In [1], in <cell line: 8>()
6 import boost_histogram as bh
7 import matplotlib.pyplot as plt
----> 8 from scipy.stats import norm
9 from numba_stats import norm as norm_nb
10 print("iminuit version", iminuit.__version__)
File /usr/lib/python3.10/site-packages/scipy/stats/__init__.py:467, in <module>
1 """
2 .. _statsrefmanual:
3
(...)
462
463 """
465 from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
466 DegenerateDataWarning, FitError)
--> 467 from ._stats_py import *
468 from ._variation import variation
469 from .distributions import *
File /usr/lib/python3.10/site-packages/scipy/stats/_stats_py.py:39, in <module>
36 from numpy.lib import NumpyVersion
37 from numpy.testing import suppress_warnings
---> 39 from scipy.spatial.distance import cdist
40 from scipy.ndimage import _measurements
41 from scipy._lib._util import (check_random_state, MapWrapper,
42 rng_integers, _rename_parameter)
File /usr/lib/python3.10/site-packages/scipy/spatial/__init__.py:105, in <module>
1 """
2 =============================================================
3 Spatial algorithms and data structures (:mod:`scipy.spatial`)
(...)
102 QhullError
103 """
--> 105 from ._kdtree import *
106 from ._ckdtree import *
107 from ._qhull import *
File /usr/lib/python3.10/site-packages/scipy/spatial/_kdtree.py:5, in <module>
3 import numpy as np
4 import warnings
----> 5 from ._ckdtree import cKDTree, cKDTreeNode
7 __all__ = ['minkowski_distance_p', 'minkowski_distance',
8 'distance_matrix',
9 'Rectangle', 'KDTree']
12 def minkowski_distance_p(x, y, p=2):
File _ckdtree.pyx:10, in init scipy.spatial._ckdtree()
File /usr/lib/python3.10/site-packages/scipy/sparse/__init__.py:267, in <module>
264 import warnings as _warnings
266 from ._base import *
--> 267 from ._csr import *
268 from ._csc import *
269 from ._lil import *
File /usr/lib/python3.10/site-packages/scipy/sparse/_csr.py:10, in <module>
7 import numpy as np
9 from ._base import spmatrix
---> 10 from ._sparsetools import (csr_tocsc, csr_tobsr, csr_count_blocks,
11 get_csr_submatrix)
12 from ._sputils import upcast, get_index_dtype
14 from ._compressed import _cs_matrix
ImportError: numpy.core.multiarray failed to import
[2]:
# Fixed seed, so the generated toy data set is the same on every run.
rng = np.random.default_rng(seed=1)

# Conditional variable: each sample is paired with a random y parameter.
y = rng.normal(loc=0, scale=10, size=10000)
# Truncate at 10: keep only the values strictly inside (-10, 10).
y = y[(-10 < y) & (y < 10)]
def mu(y, a, b):
    """Return the location of the Gaussian that belongs to ``y``.

    The location is the straight line ``a + b * y``; ``a`` is the intercept
    and ``b`` the slope. Works element-wise if ``y`` is a numpy array.
    """
    slope_term = b * y
    return a + slope_term
# True parameter values used to generate the toy data.
truth = dict(a=0, b=0.5, sigma=1.0)

# Draw one sample x per y value from a Gaussian whose location depends on y.
x = rng.normal(loc=mu(y, truth["a"], truth["b"]), scale=truth["sigma"])
The distribution in \(x\) is broader than a single Gaussian because it is a mixture of many Gaussian distributions with varying means. We can visualise this by binning the data in \(x\) and \(y\).
[3]:
# Bin the data in x (100 bins) and in y (5 bins), both over [-10, 10).
ax_x = bh.axis.Regular(100, -10, 10)
ax_y = bh.axis.Regular(5, -10, 10)
h = bh.Histogram(ax_x, ax_y)
h.fill(x, y)

# Overlay the x distribution of each y slice; every row of the transposed
# count matrix holds the x counts for one y bin.
for counts, (lo, hi) in zip(h.values().T, ax_y):
    plt.stairs(counts, ax_x.edges, label=f"[{lo}, {hi})", fill=True, alpha=0.2)

# Sum over the y axis to obtain the total x distribution.
h1 = h[:, sum]
plt.stairs(h1.values(), ax_x.edges, color="k", label="total")
plt.xlabel("x")
plt.ylabel("events")
plt.legend(title="y interval", frameon=False, handlelength=1.2);
Fit with conditional variable¶
The random distribution of \(x\) depends on the value of \(y\). We can exploit that information in the likelihood function to obtain a more accurate estimate of the parameters.
[4]:
def model(xy, a, b, sigma):
    """Conditional pdf of x given y.

    ``xy`` is the pair ``(x, y)``. Each x is evaluated with a Gaussian pdf
    located at ``a + b * y`` for its own y, with common width ``sigma``.
    """
    samples, cond = xy
    # cannot use norm.pdf from numba_stats here, because it is not vectorized
    # in the location parameter
    return norm.pdf(samples, loc=a + b * cond, scale=sigma)
# Unbinned likelihood over the paired (x, y) data.
nll = UnbinnedNLL((x, y), model)

# Starting values, given by parameter name.
m = Minuit(nll, a=0.0, b=0.0, sigma=2.0)
# Lower limit of zero on sigma, no upper limit.
m.limits["sigma"] = (0, None)
m.migrad()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [4], in <cell line: 11>()
9 m = Minuit(nll, 0.0, 0.0, 2.0)
10 m.limits["sigma"] = (0, None)
---> 11 m.migrad()
File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/minuit.py:694, in Minuit.migrad(self, ncall, iterate)
692 if self._precision is not None:
693 migrad.precision = self._precision
--> 694 fm = migrad(ncall, self._tolerance)
695 if fm.is_valid or fm.has_reached_call_limit:
696 break
File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:484, in Cost.__call__(self, *args)
469 def __call__(self, *args):
470 """
471 Evaluate the cost function.
472
(...)
482 float
483 """
--> 484 r = self._call(args)
485 if self.verbose >= 1:
486 print(args, "->", r)
File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:787, in UnbinnedNLL._call(self, args)
785 def _call(self, args):
786 data = self._masked
--> 787 x = self._model(data, *args)
788 x = _normalize_model_output(x)
789 if self._log:
Input In [4], in model(xy, a, b, sigma)
3 mu = a + b * y
4 # cannot use norm.pdf from numba_stats here, because it is not vectorized in mu
----> 5 return norm.pdf(x, mu, sigma)
NameError: name 'norm' is not defined
[5]:
# Build a binned representation of the fitted model to compare with the data.
a, b, sigma = m.values

# Evaluate the cdf of every per-sample Gaussian at each bin edge
# (rows: bin edges, columns: samples), sum over the samples, then take the
# difference between adjacent edges to get the expected content per bin.
cdf_at_edges = norm.cdf(ax_x.edges[:, np.newaxis], loc=mu(y, a, b), scale=sigma)
v = np.diff(cdf_at_edges.sum(axis=1))

plt.stairs(v, ax_x.edges, label="model", zorder=5, lw=2)
plt.errorbar(
    ax_x.centers,
    h1.values(),
    yerr=h1.variances() ** 0.5,
    fmt="ok",
    label="data",
)
plt.xlabel("x")
plt.ylabel("events")
plt.legend(frameon=False);
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [5], in <cell line: 5>()
2 a, b, sigma = m.values
4 # get expected content per bin from cdf, sum over the individual cdfs
----> 5 v = np.diff(np.sum(norm.cdf(ax_x.edges[:,np.newaxis],
6 mu(y, a, b), sigma), axis=1))
8 plt.stairs(v, ax_x.edges, label="model", zorder=5, lw=2)
9 plt.errorbar(ax_x.centers, h1.values(), h1.variances() ** 0.5,
10 fmt="ok", label="data")
NameError: name 'norm' is not defined
Fit without conditional variable¶
We can also ignore the dependence of \(x\) on \(y\) and just fit the total \(x\) distribution with a model built from the distribution of \(y\) values. This also works in this case, but information is lost and therefore the parameter uncertainties become larger than in the previous case.
On top of that, the calculation is much slower, because building the pdf is more expensive. We parallelise the computation with numba.
[6]:
nb.config.THREADING_LAYER = 'workqueue'


@nb.njit(parallel=True, fastmath=True)
def model(x, a, b, sigma):
    """Marginal pdf of x, built from the observed y values.

    The pdf is the average of one Gaussian per entry of the module-level
    array ``y``, each located at ``a + b * y[i]`` with common width ``sigma``.

    Parameters
    ----------
    x : array
        Points at which the pdf is evaluated.
    a, b : float
        Intercept and slope of the line that maps y to the Gaussian location.
    sigma : float
        Width shared by all Gaussians.

    Returns
    -------
    array
        Pdf values, same shape as ``x``.
    """
    mu = a + b * y
    total = np.zeros_like(x)
    # prange distributes the iterations over threads; the in-place sum into
    # ``total`` is the reduction variable of the parallel loop
    for i in nb.prange(len(mu)):
        total += norm_nb.pdf(x, mu[i], sigma)
    # Divide by the number of components so that the mixture integrates to
    # one. This only shifts the negative log-likelihood by a constant, so
    # the fitted values and uncertainties are unaffected.
    return total / len(mu)
# Unbinned likelihood that uses only the x values.
nll = UnbinnedNLL(x, model)

# Starting values, given by parameter name.
m2 = Minuit(nll, a=0.0, b=0.0, sigma=2.0)
# Lower limit of zero on sigma, no upper limit.
m2.limits["sigma"] = (0, None)
m2.migrad()
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xf
---------------------------------------------------------------------------
TypingError Traceback (most recent call last)
Input In [6], in <cell line: 16>()
14 m2 = Minuit(nll, 0.0, 0.0, 2.0)
15 m2.limits["sigma"] = (0, None)
---> 16 m2.migrad()
File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/minuit.py:694, in Minuit.migrad(self, ncall, iterate)
692 if self._precision is not None:
693 migrad.precision = self._precision
--> 694 fm = migrad(ncall, self._tolerance)
695 if fm.is_valid or fm.has_reached_call_limit:
696 break
File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:484, in Cost.__call__(self, *args)
469 def __call__(self, *args):
470 """
471 Evaluate the cost function.
472
(...)
482 float
483 """
--> 484 r = self._call(args)
485 if self.verbose >= 1:
486 print(args, "->", r)
File ~/python-iminuit/src/python-iminuit/build/lib.linux-x86_64-cpython-310/iminuit/cost.py:787, in UnbinnedNLL._call(self, args)
785 def _call(self, args):
786 data = self._masked
--> 787 x = self._model(data, *args)
788 x = _normalize_model_output(x)
789 if self._log:
File ~/python-iminuit/src/python-iminuit/test-env/lib/python3.10/site-packages/numba/core/dispatcher.py:468, in _DispatcherBase._compile_for_args(self, *args, **kws)
464 msg = (f"{str(e).rstrip()} \n\nThis error may have been caused "
465 f"by the following argument(s):\n{args_str}\n")
466 e.patch_message(msg)
--> 468 error_rewrite(e, 'typing')
469 except errors.UnsupportedError as e:
470 # Something unsupported is present in the user code, add help info
471 error_rewrite(e, 'unsupported_error')
File ~/python-iminuit/src/python-iminuit/test-env/lib/python3.10/site-packages/numba/core/dispatcher.py:409, in _DispatcherBase._compile_for_args.<locals>.error_rewrite(e, issue_type)
407 raise e
408 else:
--> 409 raise e.with_traceback(None)
TypingError: Failed in nopython mode pipeline (step: nopython frontend)
NameError: name 'norm_nb' is not defined
[7]:
# One panel per fitted parameter, comparing both fits against the true value.
fig, ax = plt.subplots(1, 3, figsize=(8, 2), constrained_layout=True)

# (x tick label, fit result, marker format) for the two fits being compared
fits = (
    ("with\n conditional", m, "ok"),
    ("without\n conditional", m2, "or"),
)

for name, panel in zip(m.parameters, ax):
    expected = truth[name]
    panel.set_title(name)
    panel.axhline(expected, ls="--", color="0.5")
    for label, fit, marker in fits:
        panel.errorbar([label], fit.values[name], fit.errors[name], fmt=marker)
    panel.set_xlim(-0.5, 1.5)
    # Show a window of +-2 times the error of the fit without conditional
    # variable, centred on the true value.
    half_range = 2 * m2.errors[name]
    panel.set_ylim(expected - half_range, expected + half_range)