Source code for flamedisx.source

from copy import copy
from contextlib import contextmanager
import inspect
import typing as ty
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp

from tqdm import tqdm

import flamedisx as fd
export, __all__ = fd.exporter()

o = tf.newaxis


@export
class Source:
    #: Number of event batches to use in differential rate computations
    n_batches = None

    #: Number of fake events that were padded to the final batch
    #: to make it match the batch size
    n_padding = None

    #: Whether to trace (compile into a tensorflow graph) the differential
    #: rate computation
    trace_difrate = True

    default_max_sigma = 3

    #: Names of model functions
    model_functions: ty.Tuple[str] = tuple()

    #: Names of model functions that take an additional first argument
    #: ('bonus arg'). This must be a subset of model_functions.
    special_model_functions: ty.Tuple[str] = tuple()

    #: Model functions whose results should be evaluated once per event,
    #: then stored with the data. For example, non-tensorflow functions.
    #: Note these cannot have any fittable parameters.
    frozen_model_functions: ty.Tuple[str] = tuple()

    #: Names of final observable dimensions (e.g. s1, s2)
    #: for use in domain / cross-domain
    final_dimensions: ty.Tuple[str] = tuple()

    #: Names of dimensions of hidden variables (e.g. produced electrons)
    #: for which domain computations and dimsize calculations are to be done
    inner_dimensions: ty.Tuple[str] = tuple()

    #: inner_dimensions excluded from variable stepping logic, i.e.
    #: for which the domain is always a single interval of integers
    no_step_dimensions: ty.Tuple[str] = tuple()

    #: Names of dimensions of hidden variables for which
    #: dimsize calculations are NOT done here (but in user-defined code),
    #: but for which we DO track _min and _dimsizes
    bonus_dimensions: ty.Tuple[str] = tuple()

    #: Names of array-valued data columns
    array_columns: ty.Tuple[str] = tuple()

    #: Any additional source attributes that should be configurable.
    model_attributes = tuple()

    # List all columns that are manually _fetch'ed here.
    # These will be added to the data_tensor even when the model function
    # inspection will not find them.
    def extra_needed_columns(self):
        return []

    #: The fully annotated event data
    data: pd.DataFrame = None

    ##
    # Initialization and helpers
    ##

    def scan_model_functions(self):
        """Discover which functions need which arguments / dimensions,
        and discover the possible parameters.

        Sets self.f_dims, self.f_params and self.defaults.
        """
        self.f_dims = f_dims = {x: [] for x in self.model_functions}
        self.f_params = f_params = {x: [] for x in self.model_functions}
        self.defaults = defaults = dict()

        for fname in self.model_functions:
            f = getattr(self, fname)

            if not callable(f):
                # Constant
                continue

            seen_special = False
            for pname, p in inspect.signature(f).parameters.items():
                if pname == 'self':
                    continue
                if pname in self.model_functions:
                    raise AttributeError(
                        f"{pname} is used both as a model function and "
                        f"as a parameter of a model function, {fname}")
                if p.default is inspect.Parameter.empty:
                    if fname in self.special_model_functions and not seen_special:
                        seen_special = True
                    else:
                        # It's an observable dimension
                        f_dims[fname].append(pname)
                else:
                    # It's a parameter that can be fitted
                    f_params[fname].append(pname)
                    if pname in defaults and p.default != defaults[pname]:
                        raise ValueError(f"Inconsistent defaults for {pname}")
                    defaults[pname] = tf.convert_to_tensor(
                        p.default, dtype=fd.float_type())

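    # Illustrative sketch (not part of the original source) of how a model
    # function's signature is interpreted by scan_model_functions. The
    # subclass, function, and parameter names below are assumptions for
    # demonstration only.
    #
    #   >>> class MySource(Source):
    #   ...     model_functions = ('electron_detection_eff',)
    #   ...
    #   ...     def electron_detection_eff(self, drift_time, *, g2=20.):
    #   ...         return g2 * tf.exp(-drift_time / 100e3)
    #
    # Here ``drift_time`` has no default, so it is treated as a data
    # dimension (added to f_dims); ``g2`` has a default, so it becomes a
    # fittable parameter (added to f_params, with 20. stored in defaults).
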
    def print_config(self, format='table', column_widths=(40, 20),
                     omit=tuple()):
        """Print the defaults of all parameters (from Source.defaults),
        and of model functions that have been set to constants
        (from Source.f_dims)

        :param format: 'table' to print a fixed-width table,
            'config' to print as a configuration file
        :param column_widths: 2-tuple of column widths to use for table format
        :param omit: settings to omit from printout. Useful for
            format='config', since some things (like arrays and tensors)
            cannot be evaluated from their string representation.
        """
        def print_row(*cols, header=False):
            cols = [str(x).replace('\n', '') for x in cols]
            if format == 'table':
                print(''.join([
                    col.ljust(w) if len(col) < w else col[:w - 3] + '...'
                    for col, w in zip(cols, column_widths)]))
            else:
                result = '# ' if header else ''
                result += ' = '.join(cols)
                print(result)

        format_value = str if format == 'table' else repr

        def print_line(marker='-'):
            if format == 'table':
                print(marker * sum(column_widths))
            else:
                print()

        print_row('Parameter', 'Default', header=True)
        print_line()
        for pname, default in sorted(self.defaults.items()):
            if pname in omit:
                continue
            print_row(pname, format_value(default.numpy()))
        print()

        print_row("Constant (could be made a function)", 'Default',
                  header=True)
        print_line()
        for fname in sorted(self.model_functions):
            if fname in omit:
                continue
            f = getattr(self, fname)
            if not callable(f):
                print_row(fname, format_value(f))
        print()

        print_row("Other attribute", 'Default', header=True)
        print_line()
        for fname in sorted(self.model_attributes):
            if fname in omit:
                continue
            print_row(fname, format_value(getattr(self, fname)))
        print()

    def __init__(self,
                 data=None,
                 batch_size=10,
                 max_sigma=None,
                 max_dim_size=120,
                 data_is_annotated=False,
                 _skip_tf_init=False,
                 _skip_bounds_computation=False,
                 fit_params=None,
                 progress=False,
                 **params):
        """Initialize a flamedisx source

        :param data: Dataframe with events to use in the inference
        :param batch_size: Number of events / tensorflow batch
        :param max_sigma: Hint for hidden variable bounds computation.
            If omitted, set to default_max_sigma.
        :param max_dim_size: Maximum bounds size for inner_dimensions,
            excluding no_step_dimensions
        :param data_is_annotated: If True, skip annotation
        :param _skip_tf_init: If True, skip tensorflow cache initialization
        :param _skip_bounds_computation: If True, skip bounds computation
        :param fit_params: List of parameters to fit
        :param progress: whether to show progress bars for mu estimation
            (if data is not None)
        :param params: New defaults for parameters, and new values for
            constant-valued model functions.
        """
        if max_sigma is None:
            max_sigma = self.default_max_sigma
        self.max_sigma = max_sigma
        self.max_dim_size = max_dim_size

        # Check for duplicated model functions
        for attrname in ['model_functions', 'special_model_functions']:
            l_ = getattr(self, attrname)
            if len(set(l_)) != len(l_):
                raise ValueError(f"{attrname} contains duplicates: {l_}")

        # Check all special model functions are actually model functions
        for fname in self.special_model_functions:
            if fname not in self.model_functions:
                raise ValueError(
                    f"{fname} is listed as a special model function, "
                    f"but not as a model function")

        # Discover which functions need which arguments / dimensions.
        # Discover possible parameters.
        self.scan_model_functions()

        # Change from (column, length) tuple to dict
        self.array_columns = dict(self.array_columns)

        # Which columns are needed from data?
        ctc = list(set(sum(self.f_dims.values(), [])))  # Used in model functions
        ctc += list(self.final_dimensions)              # Final observables (e.g. S1, S2)
        ctc += self.extra_needed_columns()              # Manually fetched columns
        ctc += self.frozen_model_functions              # Frozen methods (e.g. not tf-compatible)
        ctc += [x + '_min' for x in self.inner_dimensions]       # Left bounds of domains
        ctc += [x + '_max' for x in self.inner_dimensions]       # Right bounds of domains
        ctc += [x + '_min' for x in self.bonus_dimensions]       # Left bounds of domains
        ctc += [x + '_steps' for x in self.inner_dimensions]     # Step sizes
        ctc += [x + '_steps' for x in self.bonus_dimensions]     # Step sizes
        ctc += [x + '_dimsizes' for x in self.inner_dimensions]  # Dimension sizes
        ctc += [x + '_dimsizes' for x in self.bonus_dimensions]  # Dimension sizes
        ctc += [x + '_dimsizes' for x in self.final_dimensions]  # Dimension sizes
        ctc = list(set(ctc))
        self.column_index = fd.index_lookup_dict(
            ctc, column_widths=self.array_columns)
        self.n_columns_in_data_tensor = (
            len(self.column_index)
            + sum(self.array_columns.values())
            - len(self.array_columns))

        self.set_defaults(**params)

        if fit_params is None:
            fit_params = list(self.defaults.keys())
        # Filter out parameters the source does not use
        self.fit_params = [x for x in fit_params if x in self.defaults]

        self.parameter_index = fd.index_lookup_dict(self.defaults.keys())
        # Indices of params we actually want to fit;
        # we have to differentiate wrt these
        self.fit_param_indices = tuple([
            self.parameter_index[param_name]
            for param_name in self.fit_params])

        if data is None:
            # We're calling the source without data. Set the batch_size here,
            # since we can't pass it to set_data later.
            self.batch_size = batch_size
        else:
            self.batch_size = min(batch_size, len(data))

        self.set_data(data,
                      progress=progress,
                      data_is_annotated=data_is_annotated,
                      _skip_tf_init=_skip_tf_init,
                      _skip_bounds_computation=_skip_bounds_computation)

        if not _skip_tf_init:
            self.trace_differential_rate()

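    # Illustrative usage (not part of the original source; ``MySource``,
    # ``events_df`` and the ``g2`` parameter are assumptions for
    # demonstration):
    #
    #   >>> source = MySource(events_df, batch_size=100, max_dim_size=70,
    #   ...                   fit_params=['g2'], g2=22.)
    #
    # Extra keyword arguments such as ``g2=22.`` become new parameter
    # defaults or override constant-valued model functions, as handled by
    # set_defaults below.
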
    def set_defaults(self, *, config=None, **params):
        # Load new params from configuration files
        params = {**fd.load_config(config), **params}

        # Apply new defaults
        unused = dict()
        for k, v in params.items():
            if k in self.defaults:
                # Change a default
                self.defaults[k] = tf.convert_to_tensor(
                    v, dtype=fd.float_type())
            elif k in self.model_functions:
                # Change a model function, only allowed if it is a constant
                # (otherwise, just subclass the source)
                f = getattr(self, k)
                if callable(f):
                    raise AttributeError(
                        f"Use source subclassing to override the non-constant "
                        f"model function {k}")
                setattr(self, k, v)
            elif k in self.model_attributes:
                # Change a generic model attribute
                setattr(self, k, v)
            else:
                unused[k] = v
        if unused:
            warnings.warn(f"Defaults for unused settings ignored: {unused}")

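    # Illustrative usage (not part of the original source; the ``g2``
    # parameter and ``my_constant`` model function are assumptions):
    #
    #   >>> source.set_defaults(g2=22.)           # change a parameter default
    #   >>> source.set_defaults(my_constant=0.5)  # allowed only if the model
    #   ...                                       # function is a constant
    #
    # Settings the source does not recognise are ignored with a warning.
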
    def set_data(self,
                 data=None,
                 data_is_annotated=False,
                 _skip_tf_init=False,
                 _skip_bounds_computation=False,
                 **params):
        self.set_defaults(**params)

        if data is None:
            self.data = self.n_batches = self.n_padding = None
            return
        self.data = data
        del data

        # Annotate requires n_events; currently no padding
        self.n_padding = 0
        self.n_events = len(self.data)
        self.n_batches = np.ceil(
            self.n_events / self.batch_size).astype(int)

        if not _skip_tf_init:
            # Extend dataframe with events to nearest batch_size multiple.
            # We're using actual events for padding, since using zeros or
            # nans caused problems with gradient calculation;
            # padded events are clipped when summing likelihood terms.
            self.n_padding = self.n_batches * self.batch_size - len(self.data)
            if self.n_padding > 0:
                # Repeat first event n_padding times and concat to rest of data
                df_pad = self.data.iloc[np.zeros(self.n_padding, dtype=int)]
                self.data = pd.concat([self.data, df_pad], ignore_index=True)

        if not data_is_annotated:
            self.add_extra_columns(self.data)
            if not _skip_bounds_computation:
                self._annotate()
                self._calculate_dimsizes(self.max_dim_size)

        if not _skip_tf_init:
            self._check_data()
            self._populate_tensor_cache()

    def _check_data(self):
        """Do any final checks on the self.data dataframe,
        before passing it on to the tensorflow layer.
        """
        for column in self.column_index:
            if (column not in self.data.columns
                    and column not in self.frozen_model_functions):
                raise ValueError(f"Data lacks required column {column}; "
                                 f"did annotation happen correctly?")

    def _populate_tensor_cache(self):
        """Set self.data_tensor to a big tensor of shape:
          (n_batches, events_per_batch, n_columns_in_data_tensor)
        """
        # First, build a list of (n_events, 1 or column_width) tensors
        result = []
        for column in self.column_index:
            if column in self.frozen_model_functions:
                # Calculate the column
                y = self.gimme(column)
            else:
                # Just fetch it from the dataframe
                y = self._fetch(column)
            y = tf.cast(y, dtype=fd.float_type())
            # For non-array columns, add a size-1 axis for concatenation
            if len(y.shape) == 1:
                assert column not in self.array_columns
                y = tf.reshape(y, (len(y), 1))
            result.append(y)

        # Concat these and shape them to the batch size
        result = tf.concat(result, axis=1)
        self.data_tensor = tf.reshape(
            result,
            [self.n_batches, -1, self.n_columns_in_data_tensor])

    def cap_dimsizes(self, dim, cap):
        if dim not in self.no_step_dimensions:
            self.dimsizes[dim] = (
                cap * np.greater(self.dimsizes[dim], cap)
                + self.dimsizes[dim] * np.less_equal(self.dimsizes[dim], cap))

    def _calculate_dimsizes(self, max_dim_size):
        self.dimsizes = dict()
        d = self.data

        for dim in self.inner_dimensions:
            ma = d[dim + '_max'].to_numpy()
            mi = d[dim + '_min'].to_numpy()
            self.dimsizes[dim] = ma - mi + 1
            # Ensure we don't go over max_dim_size in a domain
            self.cap_dimsizes(dim, max_dim_size)
            # Calculate steps if we have capped the dimsize;
            # cover to at least the upper bound
            steps = tf.where((ma - mi + 1) > self.dimsizes[dim],
                             tf.math.ceil((ma - mi) / (self.dimsizes[dim] - 1)),
                             1).numpy()
            # Store the steps in the dataframe
            d[dim + '_steps'] = steps

        for dim in self.final_dimensions:
            self.dimsizes[dim] = np.ones(len(d))

        # Calculate all custom dimsizes
        self.calculate_dimsizes_special()

        # Store the dimsizes in the dataframe
        for dim in self.inner_dimensions:
            d[dim + "_dimsizes"] = self.dimsizes[dim]
        for dim in self.bonus_dimensions:
            d[dim + "_dimsizes"] = self.dimsizes[dim]
        for dim in self.final_dimensions:
            d[dim + "_dimsizes"] = self.dimsizes[dim]

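    # Worked example (illustrative, not part of the original source):
    # if an inner dimension has per-event bounds min=100 and max=399, the
    # raw dimsize is 300. With max_dim_size=120 (and the dimension not in
    # no_step_dimensions), cap_dimsizes caps it to 120, and the step becomes
    # ceil((399 - 100) / (120 - 1)) = 3, so the domain runs
    # 100, 103, 106, ... and covers at least the upper bound.
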
    @contextmanager
    def _set_temporarily(self, data, **kwargs):
        """Set data and/or defaults temporarily, without affecting
        the data tensor state"""
        if data is None:
            raise ValueError("No point in setting data = None temporarily")
        old_defaults = copy(self.defaults)
        if self.data is None:
            old_data = None
        else:
            old_data = self.data[:self.n_events]   # Remove padding
        self.set_data(data, **kwargs, _skip_tf_init=True)
        try:
            yield
        finally:
            self.defaults = old_defaults
            if old_data is not None:
                self.set_data(old_data,
                              data_is_annotated=True,
                              _skip_tf_init=True)

    def annotate_data(self, data, **params):
        """Add columns to data with inference information"""
        with self._set_temporarily(data, **params):
            self._annotate()
            return self.data

    ##
    # Data fetching / calculation
    ##

    def _fetch(self, x, data_tensor=None):
        """Return a tensor column from the original dataframe (self.data)

        :param x: column name
        :param data_tensor: Data tensor, columns as in self.column_index
        """
        if data_tensor is None:
            # We're inside annotate, just return the column
            x = self.data[x].values
            if x.dtype == object:
                # This will only work on homogeneous array fields
                x = np.stack(x)
            return fd.np_to_tf(x)

        return data_tensor[:, self.column_index[x]]

    def _fetch_param(self, param, ptensor):
        if ptensor is None:
            return self.defaults[param]
        idx = tf.dtypes.cast(self.parameter_index[param],
                             dtype=fd.int_type())
        return ptensor[idx]

    def gimme(self, fname, *,
              data_tensor=None, ptensor=None,
              bonus_arg=None, numpy_out=False):
        """Evaluate the model function fname with all required arguments

        :param fname: Name of the model function to compute
        :param bonus_arg: If fname takes a bonus argument, the data for it
        :param numpy_out: If True, return (tuple of) numpy arrays,
            otherwise (tuple of) tensors.
        :param data_tensor: Data tensor, columns as in self.column_index.
            If not given, use self.data (used in annotate)
        :param ptensor: Parameter tensor, columns as in self.parameter_index.
            If not given, use the defaults dictionary (used in annotate)

        Before using gimme, you must use set_data to populate the
        internal caches.
        """
        assert (bonus_arg is not None) == (fname in self.special_model_functions)
        assert isinstance(fname, str), \
            f"gimme needs fname to be a string, not {type(fname)}"

        if data_tensor is None:
            # We're in an annotate
            assert hasattr(self, 'data'), "You must set data first"
        else:
            # We're computing
            if not hasattr(self, 'data_tensor'):
                raise ValueError(
                    "You must set_data first (and populate the tensor cache)")

        f = getattr(self, fname)

        # Frozen data methods should not be called again,
        # just fetch them from the data tensor (if we have one)
        if fname in self.frozen_model_functions:
            if data_tensor is not None:
                return self._fetch(fname, data_tensor)

        if callable(f):
            args = [self._fetch(x, data_tensor) for x in self.f_dims[fname]]
            if bonus_arg is not None:
                args = [bonus_arg] + args
            kwargs = {pname: self._fetch_param(pname, ptensor)
                      for pname in self.f_params[fname]}
            res = f(*args, **kwargs)

        else:
            if bonus_arg is None:
                n = len(self.data) if data_tensor is None else data_tensor.shape[0]
                x = tf.ones(n, dtype=fd.float_type())
            else:
                x = tf.ones_like(bonus_arg, dtype=fd.float_type())
            res = f * x

        if numpy_out:
            return fd.tf_to_np(res)
        return fd.np_to_tf(res)

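    # Illustrative usage (not part of the original source; the model function
    # name 'electron_detection_eff' is an assumption):
    #
    #   >>> # During the differential rate computation (tensors in/out):
    #   >>> eff = source.gimme('electron_detection_eff',
    #   ...                    data_tensor=data_tensor, ptensor=ptensor)
    #   >>> # During annotation / simulation, the numpy variant defined
    #   >>> # below is used instead:
    #   >>> eff = source.gimme_numpy('electron_detection_eff')
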
    def gimme_numpy(self, fname, bonus_arg=None):
        """Gimme for use in simulation / annotate"""
        return self.gimme(fname=fname,
                          data_tensor=None, ptensor=None,
                          bonus_arg=bonus_arg,
                          numpy_out=True)

    gimme_numpy.__doc__ = gimme.__doc__

    ##
    # Differential rate computation
    ##

    def batched_differential_rate(self, progress=True, **params):
        """Return numpy array with differential rate for all events."""
        progress = (lambda x: x) if not progress else tqdm
        y = []
        for i_batch in progress(range(self.n_batches)):
            q = self.data_tensor[i_batch]
            y.append(fd.tf_to_np(self.differential_rate(data_tensor=q,
                                                        **params)))

        return np.concatenate(y)[:self.n_events]

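    # Illustrative usage (not part of the original source; the ``g2``
    # parameter is an assumption):
    #
    #   >>> drates = source.batched_differential_rate(g2=22.)
    #   >>> drates.shape   # (n_events,), with padding events stripped
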
    def _batch_data_tensor_shape(self):
        return [self.batch_size, self.n_columns_in_data_tensor]

    def trace_differential_rate(self):
        """Compile the differential rate computation to a tensorflow graph"""
        input_signature = (
            tf.TensorSpec(shape=self._batch_data_tensor_shape(),
                          dtype=fd.float_type()),
            tf.TensorSpec(shape=[len(self.parameter_index)],
                          dtype=fd.float_type()))
        self._differential_rate_tf = tf.function(
            self._differential_rate,
            input_signature=input_signature)

    def differential_rate(self, data_tensor=None, autograph=True, **kwargs):
        ptensor = self.ptensor_from_kwargs(**kwargs)
        if autograph and self.trace_difrate:
            return self._differential_rate_tf(
                data_tensor=data_tensor, ptensor=ptensor)
        else:
            return self._differential_rate(
                data_tensor=data_tensor, ptensor=ptensor)

    def ptensor_from_kwargs(self, **kwargs):
        return tf.convert_to_tensor([kwargs.get(k, self.defaults[k])
                                     for k in self.defaults])

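    # Illustrative sketch (not part of the original source; parameter names
    # are assumptions): the parameter tensor is ordered as self.defaults,
    # with any parameter not passed as a keyword filled in from its default:
    #
    #   >>> ptensor = source.ptensor_from_kwargs(g2=22.)
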
    ##
    # Helpers for response model implementation
    ##

    def domain(self, x, data_tensor=None):
        """Return (n_events, n_x) matrix containing all possible integer
        values of x for each event.

        If x is a final dimension (e.g. s1, s2), we return an (n_events, 1)
        tensor with observed values -- NOT a (n_events,) array!
        """
        if x in self.final_dimensions:
            return self._fetch(x, data_tensor=data_tensor)[:, o]

        # Cover the bounds range in integer steps, not necessarily of 1
        left_bound = self._fetch(x + '_min', data_tensor=data_tensor)[:, o]
        steps = self._fetch(x + '_steps', data_tensor=data_tensor)[:, o]
        x_range = tf.range(tf.reduce_max(
            self._fetch(x + '_dimsizes', data_tensor=data_tensor))) * steps
        return left_bound + x_range

    def cross_domains(self, x, y, data_tensor):
        """Return (x, y) two-tuple of (n_events, n_x, n_y) tensors
        containing possible integer values of x and y, respectively.
        """
        # TODO: somehow mask unnecessary elements and save computation time
        x_domain = self.domain(x, data_tensor)
        y_domain = self.domain(y, data_tensor)
        result_x = tf.repeat(x_domain[:, :, o], tf.shape(y_domain)[1], axis=2)
        result_y = tf.repeat(y_domain[:, o, :], tf.shape(x_domain)[1], axis=1)
        return result_x, result_y

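    # Illustrative sketch (not part of the original source; the dimension
    # names are assumptions): for hidden dimensions 'electrons_produced' and
    # 'electrons_detected',
    #
    #   >>> ep, ed = source.cross_domains('electrons_produced',
    #   ...                               'electrons_detected', data_tensor)
    #
    # returns two (n_events, n_produced, n_detected) tensors that together
    # enumerate every combination of the two hidden-variable domains.
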
    ##
    # Simulation methods and helpers
    ##

    def simulate(self, n_events, fix_truth=None, full_annotate=False,
                 **params):
        """Simulate n events.

        Will omit events lost due to selection/detection efficiencies.
        """
        assert isinstance(n_events, (int, float)), \
            f"n_events must be an int or float, not {type(n_events)}"

        # Draw random "deep truth" variables (energy, position).
        # Pass on a copy of the dict or DataFrame.
        fix_truth = self.validate_fix_truth(
            fix_truth.copy() if fix_truth is not None else None)

        sim_data = self.random_truth(n_events, fix_truth=fix_truth, **params)
        assert isinstance(sim_data, pd.DataFrame)

        with self._set_temporarily(sim_data,
                                   _skip_bounds_computation=True,
                                   **params):
            # Do the forward simulation of the detector response
            d = self._simulate_response()
            if full_annotate:
                # Now that we have s1 and s2 values, we can populate
                # columns like e_vis, photon_produced_mle, etc.
                # This is optional since it can be expensive (e.g. for the
                # WIMPsource, where it includes the full energy spectrum!)
                return self.annotate_data(d)
            return d

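    # Illustrative usage (not part of the original source; the parameter and
    # truth-column names are assumptions):
    #
    #   >>> sim = source.simulate(10_000, g2=22.)
    #   >>> sim_fixed = source.simulate(10_000, fix_truth={'energy': 5.})
    #
    # The returned dataframe may contain fewer than n_events rows, since
    # events lost to selection/detection efficiencies are omitted.
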
    def validate_fix_truth(self, fix_truth):
        """Return checked fix_truth, with extra derived variables if needed"""
        return fix_truth

    @staticmethod
    def _overwrite_fixed_truths(data, fix_truth, n_events):
        """Replaces all columns in data with fix_truth.

        Careful: ensure mutual constraints are accounted for first!
        (e.g. fixing energy for a modulating WIMP has consequences for
        the time distribution.)
        """
        if fix_truth is not None:
            for k, v in fix_truth.items():
                data[k] = np.ones(n_events, dtype=float) * v

    ##
    # Mu estimation
    ##

    def mu_function(self,
                    interpolation_method='star',
                    n_trials=int(1e5),
                    progress=True,
                    **param_specs):
        """Return interpolator for number of expected events.

        Parameters must be specified as kwarg=(start, stop, n_anchors)
        """
        if interpolation_method != 'star':
            raise NotImplementedError(
                f"mu interpolation method {interpolation_method} "
                f"not implemented")

        # Estimate mu under the current defaults
        base_mu = tf.constant(self.estimate_mu(n_trials=n_trials),
                              dtype=fd.float_type())

        # Estimate mus under the specified variations
        pspaces = dict()    # parameter -> tf.linspace of anchors
        mus = dict()        # parameter -> tensor of mus
        _iter = param_specs.items()
        if progress:
            _iter = tqdm(_iter, desc="Estimating mus")
        for pname, (start, stop, n) in _iter:
            if pname not in self.defaults:
                # We don't take this parameter. Consistent with __init__,
                # don't complain and just discard it silently.
                continue
            # Parameters are floats, but users might input ints as anchors
            # accidentally, triggering a confusing tensorflow device
            # placement message
            start, stop = float(start), float(stop)
            pspaces[pname] = tf.linspace(start, stop, n)
            mus[pname] = tf.convert_to_tensor(
                 [self.estimate_mu(**{pname: x}, n_trials=n_trials)
                  for x in np.linspace(start, stop, n)],
                 dtype=fd.float_type())

        def mu_itp(**kwargs):
            mu = base_mu
            for pname, v in kwargs.items():
                mu *= tfp.math.interp_regular_1d_grid(
                    x=v,
                    x_ref_min=param_specs[pname][0],
                    x_ref_max=param_specs[pname][1],
                    y_ref=mus[pname]) / base_mu
            return mu

        return mu_itp

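    # Illustrative usage (not part of the original source; the ``g2``
    # parameter and its anchor range are assumptions):
    #
    #   >>> mu_itp = source.mu_function(n_trials=int(1e5), g2=(15., 25., 5))
    #   >>> mu_itp(g2=22.)   # interpolated expected event count
    #
    # Each keyword gives (start, stop, n_anchors); the returned function
    # multiplies the base mu by the interpolated relative change for each
    # parameter it is called with.
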
    def estimate_mu(self, n_trials=int(1e5), **params):
        """Return estimate of total expected number of events

        :param n_trials: Number of events to simulate for estimate
        """
        d_simulated = self.simulate(n_trials, **params)
        return (self.mu_before_efficiencies(**params)
                * len(d_simulated) / n_trials)

    ##
    # Functions you have to override
    ##

    def _differential_rate(self, data_tensor, ptensor):
        raise NotImplementedError

    def mu_before_efficiencies(self, **params):
        """Return mean expected number of events BEFORE efficiencies/response,
        using data for the evaluation of the energy spectra
        """
        raise NotImplementedError

    ##
    # Functions you probably should override
    ##

    def _annotate(self):
        """Add columns needed in inference to self.data"""
        pass

    def add_extra_columns(self, data):
        """Add additional columns to data

        :param data: pandas DataFrame
        """
        pass

    def random_truth(self, n_events, fix_truth=None, **params):
        """Draw random "deep truth" variables (energy, position)"""
        raise NotImplementedError

    def _simulate_response(self):
        """Do a forward simulation of the detector response, using self.data"""
        return self.data

@export
class ColumnSource(Source):
    """Source that expects precomputed differential rate in a column,
    and precomputed mu in an attribute
    """
    #: Name of the data column containing the precomputed differential rate
    column = 'rename_me!'

    #: Expected events for this source
    mu = 42.

    def extra_needed_columns(self):
        return super().extra_needed_columns() + [self.column]

    def estimate_mu(self, n_trials=None, **params):
        return self.mu

    def mu_before_efficiencies(self, **params):
        return self.mu

    def _differential_rate(self, data_tensor, ptensor):
        return self._fetch(self.column, data_tensor)

    def random_truth(self, n_events, fix_truth=None, **params):
        print(f"{self.__class__.__name__} cannot generate events, skipping")
        return pd.DataFrame()

    def calculate_dimsizes_special(self):
        pass
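
# Illustrative sketch (not part of the original source): a minimal
# ColumnSource subclass. The class name, column name, mu value, and
# ``events_df`` dataframe are assumptions for demonstration only.
#
#   >>> class MyTemplateSource(ColumnSource):
#   ...     column = 'template_diff_rate'   # precomputed differential rate
#   ...     mu = 100.                       # precomputed expected events
#   ...
#   >>> source = MyTemplateSource(data=events_df, batch_size=100)
#   >>> drates = source.batched_differential_rate()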