Source code for ethz_snow.snowfall

"""Implement Snowfall class.

This module contains the Snowfall class used to run repeated (!)
simulations of water nucleation in vials. It makes use of class
Snowflake for the individual simulations.
"""
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

import multiprocessing as mp
from typing import List, Tuple, Union, Sequence, Optional

from ethz_snow.snowflake import Snowflake

from .__init__ import __citation__, __bibtex__


[docs]class Snowfall:
    """A class to handle multiple Stochastic Nucleation of Water simulation.

    More information regarding the equations and their derivation can be found in
    "Stochastic shelf-scale modeling framework for the freezing stage in
    freeze-drying processes", Deck, Ochsenbein, and Mazzotti (2022),
    Int J Pharm, 613, 121276, https://doi.org/10.1016/j.ijpharm.2021.121276
    as well as in the Snowflake class documentation.


    Parameters:
        Nrep (int): Number of repetitions.
        pool_size (int): Size of worker pool for parallelization.
        stats (dict): Statistics for each simulation.
        stats_df (pd.DataFrame): Long-form table of all statistics.
        simulationStatus (int): Status of simulation (0 = not run, 1 = run).
    """

    def __init__(self, Nrep: int = 5, pool_size: int = None, **kwargs):
        """Construct a Snowfall object.

        Args:
            Nrep (int, optional): Number of repetitions. Defaults to 5.
            pool_size (int, optional): Size of worker pool for parallelization.
                Defaults to None (= # of cpu cores).
        """
        self.pool_size = pool_size

        self.Nrep = int(Nrep)

        if "seed" in kwargs.keys():
            # seed will be chosen by Snowfall
            del kwargs["seed"]

        if ("storeStates" in kwargs.keys()) and (kwargs["storeStates"] is not None):
            print("WARNING: States cannot be stored for Snowfall simulations.")
            del kwargs["storeStates"]

        Sf_template = Snowflake(**kwargs)
        Sf_template._buildHeatflowMatrices()  # pre-build H_int, H_ext, H_shelf

        self.Sf_template = Sf_template

        self.stats = dict()
        self.stats_df = pd.DataFrame()

    @property
    def simulationStatus(self) -> int:
        """Return simulation status of instance.

        Returns:
            int: Simulation status. 0 = not run, 1 = run.
        """
        if self.stats:
            return 1
        else:
            return 0

    @classmethod
    def _uniqueFlake(cls, S, seed):
        """Run a single Snowflake with a specific seed."""
        S.seed = seed
        S.run()
        return S.stats

    @classmethod
    def _uniqueFlake_sync(cls, S, seed, return_dict):
        """Run a single Snowflake with a specific seed.

        Run a single Snowflake with a specific seed
        and write it back into multiprocessing.Manager object for data sharing.
        """
        S.seed = seed
        S.run()
        return_dict[seed] = S.stats

[docs]    def run(self, how="async"):
        """Run all the Snowflake simulations.

        Args:
            how (str, optional): How to perform runs. Valid options are
                'async', 'sync', and 'sequential' (no parallelization).
                Defaults to "async".
        """
        # clean up old simulation
        self.stats = dict()
        self.stats_df = pd.DataFrame()

        if how == "async":
            with mp.Pool(self.pool_size) as p:
                # starmap is only available since python 3.3
                # it allows passing multiple arguments
                res = p.starmap_async(
                    Snowfall._uniqueFlake,
                    [(self.Sf_template, i) for i in range(self.Nrep)],
                ).get()
            self.stats = {i: r for i, r in enumerate(res)}

        elif how == "sync":
            manager = mp.Manager()
            return_dict = manager.dict()
            with mp.Pool(self.pool_size) as p:
                # starmap is only available since python 3.3
                # it allows passing multiple arguments
                p.starmap(
                    Snowfall._uniqueFlake_sync,
                    [(self.Sf_template, i, return_dict) for i in range(self.Nrep)],
                )
            self.stats = dict(return_dict)

        elif how == "sequential":
            for i in range(self.Nrep):
                self.stats[i] = self._uniqueFlake(self.Sf_template, i)

[docs]    def nucleationTimes(
        self,
        group: Union[str, Sequence[str]] = "all",
        seed: Union[int, Sequence[int], None] = None,
    ) -> np.ndarray:
        """Return nucleation times.

        Args:
            group (Union[str, Sequence[str]], optional): Subgroup to return.
                Defaults to "all".
            seed (Union[int, Sequence[int], None], optional): Seed(s) to return.
                Defaults to None.

        Returns:
            np.ndarray: The nucleation times.
        """
        return self._returnStats(what="tnuc", group=group, seed=seed)

[docs]    def nucleationTemperatures(
        self,
        group: Union[str, Sequence[str]] = "all",
        seed: Union[int, Sequence[int], None] = None,
    ) -> np.ndarray:
        """Return nucleation temperatures.

        Args:
            group (Union[str, Sequence[str]], optional): Subgroup to return.
                Defaults to "all".
            seed (Union[int, Sequence[int], None], optional): Seed(s) to return.
                Defaults to None.

        Returns:
            np.ndarray: The nucleation temperatures.
        """
        return self._returnStats(what="Tnuc", group=group, seed=seed)

[docs]    def solidificationTimes(
        self,
        group: Union[str, Sequence[str]] = "all",
        seed: Union[int, Sequence[int], None] = None,
    ) -> np.ndarray:
        """Return solidification times.

        Args:
            group (Union[str, Sequence[str]], optional): Subgroup to return.
                Defaults to "all".
            seed (Union[int, Sequence[int], None], optional): Seed(s) to return.
                Defaults to None.

        Returns:
            np.ndarray: The solidification times.
        """
        return self._returnStats(what="tsol", group=group, seed=seed)

    def _returnStats(
        self,
        what: str,
        group: str = "all",
        seed: Union[int, Sequence[int], None] = None,
    ):
        """Return arbitarary statistic.

        Helper function to compute arbitrary statistic for
        nucleationTimes, solidificationTimes, nucleationTemperatures.
        Args:
            what (str): What to return.
            group (Union[str, Sequence[str]], optional): Subgroup to return.
                Defaults to "all".
            seed (Union[int, Sequence[int], None], optional): Seed(s) to return.
                Defaults to None.

            Returns:
                np.ndarray: The statistic in question.
        """
        df = self.to_frame()

        if group != "all":
            if not isinstance(group, (tuple, list)):
                group = [group]
            df = df[df.group.isin(group)]

        if seed is not None:
            if not isinstance(seed, (tuple, list)):
                seed = [seed]
            df = df[df.seed.isin(seed)]

        if what == "tnuc":
            df = df[df.variable == "t_nucleation"]
        elif what == "Tnuc":
            df = df[df.variable == "T_nucleation"]
        elif what == "tsol":
            df = df[df.variable == "t_solidification"]

        return df["value"].to_numpy()

[docs]    def plot(
        self,
        what: str = "t_nucleation",
        kind: str = "box",
        seed: Union[int, Sequence[int], None] = None,
        group: Union[str, Sequence[str]] = "all",
    ):
        """Create plots for Snowfall object.

        Args:
            kind (str, optional): Any sns.catplot 'kind' input is allowed.
                Defaults to "box".
            what (str, optional): What to plot, i.e., keys of the stats dict.
                Valid options are t_nucleation, T_nucleation, t_solidification.
                Defaults to "t_nucleation".
            seed (Union[int, Sequence[int], None], optional): Seed(s) to return.
                Defaults to None.
            group (Union[str, Sequence[str]], optional): Subgroup to return.
                Defaults to "all".
        """
        df = self.to_frame()

        if group != "all":
            if not isinstance(group, (tuple, list)):
                group = [group]
            df = df[df.group.isin(group)]

        if seed is not None:
            if not isinstance(seed, (tuple, list)):
                seed = [seed]
            df = df[df.seed.isin(seed)]

        df = df[df.variable.str.contains(what)]
        sns.catplot(data=df, hue="group", y="value", kind=kind, x="variable")

[docs]    def to_frame(self) -> pd.DataFrame:
        """Save statistics as pandas dataframe.

        Raises:
            ValueError: Simulation needs to run first.

        Returns:
            pd.DataFrame: The statistics in long form.
        """
        if self.simulationStatus == 0:
            raise ValueError("Simulation needs to be run first.")
        elif self.stats_df.empty:
            stats_df = pd.DataFrame(
                columns=["group", "vial", "variable", "value", "seed"]
            )
            self.Sf_template._simulationStatus = 1
            # call Snowflake function for the first Snowflake
            # to get the right layout
            N_tot = self.Sf_template.N_vials_total
            self.Sf_template.stats = self.stats[0]
            # repeat the first df because vial and group IDs should not change
            stats_df = pd.concat([self.Sf_template.to_frame()[0]] * self.Nrep)
            # get the colum containing variable and value
            varCol, valCol = (
                stats_df.columns.get_loc("variable"),
                stats_df.columns.get_loc("value"),
            )
            for i in range(1, self.Nrep):
                stats_df.iloc[
                    i * N_tot * 3 : (i + 1) * N_tot * 3, [varCol, valCol]
                ] = pd.DataFrame(self.stats[i]).melt()
                # this assumes that the first stats_df 'melted' in the same way
                # we will add a test to ensure this assumption is not violated
            stats_df["seed"] = np.repeat(np.arange(0, self.Nrep), N_tot * 3)

            # clean up template
            self.Sf_template.stats = dict()
            self.Sf_template._simulationStatus = 0

            self.stats_df = stats_df
        elif not self.stats_df.empty:
            # do not recalculate, but load from memory
            stats_df = self.stats_df

        return stats_df

    def __repr__(self) -> str:
        """Return string representation of the Snowfall class.

        Returns:
            str: The Snowfall class string representation giving some basic info.
        """
        return (
            f"Snowfall([{self.Nrep} Snowflake{'s' if self.Nrep > 1 else ''}, "
            + f"pool_size: {self.pool_size}])"
        )