Source code for pypesto.profile.validation_intervals

"""Validation intervals."""

import logging
from copy import deepcopy
from typing import Optional

from scipy.stats import chi2

from ..engine import Engine
from ..optimize import Optimizer, minimize
from ..problem import Problem
from ..result import Result

logger = logging.getLogger(__name__)


[docs] def validation_profile_significance( problem_full_data: Problem, result_training_data: Result, result_full_data: Optional[Result] = None, n_starts: Optional[int] = 1, optimizer: Optional[Optimizer] = None, engine: Optional[Engine] = None, lsq_objective: bool = False, return_significance: bool = True, ) -> float: r""" Compute significance of Validation Interval. It is a confidence region/interval for a new validation experiment. [#Kreutz]_ et al. (This method per default returns the significance = 1-alpha!) The reasoning behind their approach is, that a validation data set is outside the validation interval, if fitting the full data set would lead to a fit :math:`\theta_{new}`, that does not contain the old fit :math:`\theta_{train}` in their (Profile-Likelihood) based parameter-confidence intervals. (I.e. the old fit would be rejected by the fit of the full data.) This method returns the significance of the validation data set (where `result_full_data` is the objective function for fitting both data sets). I.e. the largest alpha, such that there is a validation region/interval such that the validation data set lies outside this Validation Interval with probability alpha. (If one is interested in the opposite, set `return_significance=False`.) Parameters ---------- problem_full_data: pypesto.problem, such that the objective is the negative-log-likelihood of the training and validation data set. result_training_data: Result object from the fitting of the training data set only. result_full_data Result object that contains the result of fitting training and validation data combined. n_starts number of starts for fitting the full data set (if `result_full_data` is not provided). optimizer: optimizer used for refitting the data (if `result_full_data` is not provided). engine: engine for refitting (if `result_full_data` is not provided). lsq_objective: indicates if the objective of `problem_full_data` corresponds to a nllh (``False``), or a :math:`\chi^2` value (``True``). return_significance: indicates, if the function should return the significance (``True``) (i.e. the probability, that the new data set lies outside the Confidence Interval for the validation experiment, as given by the method), or the largest alpha, such that the validation experiment still lies within the Confidence Interval (``False``). I.e. :math:`\alpha = 1-significance`. .. [#Kreutz] Kreutz, Clemens, Raue, Andreas and Timmer, Jens. “Likelihood based observability analysis and confidence intervals for predictions of dynamic models”. BMC Systems Biology 2012/12. doi:10.1186/1752-0509-6-120 """ if (result_full_data is not None) and (optimizer is not None): raise UserWarning( "optimizer will not be used, as a result object " "for the full data set is provided." ) # if result for full data is not provided: minimize if result_full_data is None: x_0 = result_training_data.optimize_result.x # copy problem, in order to not change/overwrite x_guesses problem = deepcopy(problem_full_data) problem.set_x_guesses(x_0) result_full_data = minimize( problem=problem, optimizer=optimizer, n_starts=n_starts, engine=engine, progress_bar=False, ) # Validation intervals compare the nllh value on the full data set # of the parameter fits from the training and the full data set. nllh_new = result_full_data.optimize_result.fval[0] nllh_old = problem_full_data.objective( problem_full_data.get_reduced_vector( result_training_data.optimize_result.x[0] ) ) if nllh_new > nllh_old: logger.warning( "Fitting of the full data set provided a worse fit " "than the fit only considering the training data. " "Consider rerunning the not handing over " "result_full_data or running the fitting from the " "best parameters found from the training data." ) # compute the probability, that the validation data set is outside the CI # => survival function chi.sf if return_significance: if lsq_objective: return chi2.sf(nllh_old - nllh_new, 1) else: return chi2.sf(2 * (nllh_old - nllh_new), 1) # compute the probability, that the validation data set is inside the CI # => cumulative density function chi.cdf else: if lsq_objective: return chi2.cdf(nllh_old - nllh_new, 1) else: return chi2.cdf(2 * (nllh_old - nllh_new), 1)