Skip to content

Reference for coalescent approach in multitask learning

bmm_multitask_learning.coalescent

coalescent_inference

CoalescentTree

Source code in bmm_multitask_learning/coalescent/coalescent_inference.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
class CoalescentTree:
    """
    Greedy coalescent tree built over a list of leaf Items (see [2]).

    ``items`` holds one list of nodes per level: level 0 is the leaves,
    each following level has one node fewer (two nodes merged into one).
    """

    def __init__(self, leaves: list[Item], cov, dim):
        self.leaves = leaves
        self.items, _ = self._greedy_rate_brownian(leaves, cov, dim)

    def iterate_over(self) -> Iterator[Item]:
        """
        iterates over all elements in tree
        """
        return (node for level in self.items for node in level)

    @staticmethod
    def select_candidates(x, cov, t_m1, n, dim):
        """
        Realise greedy selection of candidates to coalesce.
        For this consider all pairs of items and select minimal delta for coalesce

        More details in [2]
        """
        remaining = len(x)
        merged_so_far = n - remaining

        best = (None, None, float("inf"))
        for a in range(remaining):
            for b in range(a + 1, remaining):
                delta = Item.optimal_t(x[a], x[b], cov,
                                       merged_so_far, t_m1, n, dim)
                if delta < best[2]:
                    best = (a, b, delta)
        return best

    def _greedy_rate_brownian(self, x: list[Item], cov, dim)\
            -> tuple[list[list[Item]], list[tuple[int, int]]]:
        """
        Realises greedy coalescent tree construction
        More details in [2]

        :return: first argument is a constructed coalesce structure,
            the second argument is a pointer to child elements
        """
        n = len(x)
        levels = [x]
        merge_history = []
        t = 0
        # each merge removes exactly one node, so n-1 merges in total
        while len(levels) < n:
            candidates = copy(levels[-1])
            l, r, delta = self.select_candidates(candidates, cov, t, n, dim)
            merge_history.append((l, r))
            # coalescent time moves backwards, hence the subtraction
            t -= delta
            merged = Item.coalesce(candidates[l], candidates[r], t)

            # link both children to their new parent ...
            candidates[l].parent = merged
            candidates[r].parent = merged

            # ... then collapse the pair into a single slot
            candidates[l] = merged
            del candidates[r]
            levels.append(candidates)
        return levels, merge_history
iterate_over()

iterates over all elements in tree

Source code in bmm_multitask_learning/coalescent/coalescent_inference.py
68
69
70
71
72
73
74
def iterate_over(self) -> Iterator[Item]:
    """
    iterates over all elements in tree
    """
    for level in self.items:
        for node in level:
            yield node
select_candidates(x, cov, t_m1, n, dim) staticmethod

Realise greedy selection of candidates to coalesce. For this consider all pairs of items and select minimal delta for coalesce

More details in [2]

Source code in bmm_multitask_learning/coalescent/coalescent_inference.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
@staticmethod
def select_candidates(x, cov, t_m1, n, dim):
    """
    Realise greedy selection of candidates to coalesce.
    For this consider all pairs of items and select minimal delta for coalesce

    More details in [2]
    """
    n_cur = len(x)
    i_items = n - n_cur

    l, r = None, None
    min_time = float("inf")

    for i in range(n_cur):
        for j in range(i + 1, n_cur):
            t_new = Item.optimal_t(x[i], x[j], cov, i_items, t_m1, n, dim)
            if t_new < min_time:
                min_time, l, r = t_new, i, j
    return l, r, min_time

Item dataclass

class to handle the items of coalescent_tree

Source code in bmm_multitask_learning/coalescent/coalescent_inference.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
@dataclass
class Item:
    """
    class to handle the items of coalescent_tree

    A node carries a Gaussian message: a mean vector, a variance term
    ``cov`` and the (backward-running) time ``t`` at which it lives.
    """
    mean: list  # mean vector; in practice a numpy array (arithmetic below relies on it)
    cov: float = 0.
    t: float = 0.
    # parent link, filled in during tree construction; deliberately left
    # un-annotated so the dataclass machinery does not treat it as a field
    parent = None

    @staticmethod
    def coalesce(first, other, t):
        """
        This method coalesces two Item objects. Works for Brownian diffusion.
        For more see [2]

        [2] Y.W. Teh, H. Daumé III, and D. Roy.
            Bayesian agglomerative clustering with coalescents. NIPS, 2007.
        """
        # precision (inverse variance) of each incoming message at time t
        prec_first = 1 / (first.cov + (first.t - t))
        prec_other = 1 / (other.cov + (other.t - t))

        merged_cov = 1 / (prec_first + prec_other)
        merged_mean = (first.mean * prec_first
                       + other.mean * prec_other) * merged_cov

        return Item(merged_mean, merged_cov, t)

    @staticmethod
    def optimal_t(first, other, cov, i, t_m1, n, dim):
        """
        computes optimal delta to coalesce for proposed pair of items
        """
        # pair-count term after i coalescing events
        c_ni = ((n - i + 1) * (n - i) / 2)

        root_term = (4 * c_ni * m_norm(first.mean - other.mean, cov) +
                     dim * dim) ** 0.5
        a = 1/4 * 1/c_ni * (root_term - dim)

        b = 1/2 * (first.cov + other.cov + first.t + other.t - 2 * t_m1)

        assert a - b > 0, f"t_m1: {t_m1}, a: {a}, b: {b}"
        return a - b
coalesce(first, other, t) staticmethod

This method coalesces two Item objects. Works for Brownian diffusion. For more see [2]

[2] Y.W. Teh, H. Dauḿe III, and D. Roy. Bayesian agglomerative clustering with coalescents. NIPS, 2007.

Source code in bmm_multitask_learning/coalescent/coalescent_inference.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
@staticmethod
def coalesce(first, other, t,):
    """
    This method coalesces two Item objects. Works for brownian diffusion.
    For more see [2]

    [2] Y.W. Teh, H. Dauḿe III, and D. Roy. 
        Bayesian agglomerative clustering with coalescents. NIPS, 2007.
    """
    # messages coalesce
    # assert (self.cov + self.t - t) > 0
    # assert (other.cov + other.t - t) > 0

    l_inv = 1 / (first.cov + (first.t - t))
    r_inv = 1 / (other.cov + (other.t - t))
    new_cov = 1/(l_inv + r_inv)
    new_mean = (first.mean * l_inv + other.mean * r_inv) * new_cov

    new_t = t
    return Item(new_mean, new_cov, new_t)
optimal_t(first, other, cov, i, t_m1, n, dim) staticmethod

computes optimal delta to coalesce for proposed pair of items

Source code in bmm_multitask_learning/coalescent/coalescent_inference.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
@staticmethod
def optimal_t(first, other, cov, i, t_m1, n, dim):
    """
    computes optimal delta to coalesce for proposed pair of items
    """
    c_ni = ((n - i + 1) * (n-i)/2)

    a = 1/4 * 1/c_ni * \
        ((4 * c_ni * m_norm(first.mean - other.mean, cov) +
            dim*dim) ** 0.5 - dim)

    b = 1/2 * (first.cov + other.cov + first.t + other.t - 2 * t_m1)

    assert a - b > 0, f"t_m1: {t_m1}, a: {a}, b: {b}"
    return a - b

coalescent_learner

MultitaskProblem

Bayesian optimizer for Multitask Learning based on Coalescent [1]

To use initialize with list of TaskData and call run. Then trained weights will be available by method get_weights()

[1] @article{daume2009bayesian, title={Bayesian multitask learning with latent hierarchies}, author={Daum{'e} III, Hal}, journal={arXiv preprint arXiv:0907.0783}, year={2009} }

Source code in bmm_multitask_learning/coalescent/coalescent_learner.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
class MultitaskProblem:
    """
    Bayesian optimizer for Multitask Learning based on Coalescent [1]

    To use initialize with list of TaskData and call fit(). Then trained
    weights will be available by method get_weights()

    [1] @article{daume2009bayesian,
            title={Bayesian multitask learning with latent hierarchies},
            author={Daum{\'e} III, Hal},
            journal={arXiv preprint arXiv:0907.0783},
            year={2009}
        }
    """
    def __init__(self,
            tasks: "list[TaskData]",
            dim,
            rho=0.05,
            cov_sigma=0.1,
            s_init=None
        ):
        """
        :param tasks: list of TaskData, which will be learned
        :param dim: dimension of problems
        :param rho: parameter that scales noise level in labels
        :param cov_sigma: covariance matrix scaler for Coalescent evolution variation
        :param s_init: initial values for S_i: variance scales of data
        """

        self.tasks = tasks
        self.K = len(tasks)
        self.dim = dim
        # BUG FIX: was hard-coded to 0.05, silently ignoring the rho argument
        self.rho = rho
        # posterior over the correlation matrix R of the task weights
        self.R_distr = InverseWishart(dim, dim+1, corr_mat=True)

        self.cov_sigma = cov_sigma
        # posterior over the Brownian evolution covariance L
        self.L_distr = InverseWishart(dim, dim+1,  cov_sigma * np.eye(dim), corr_mat=False)

        if s_init is None:
            # small random initial log-scales for each task
            s_init = np.random.randn(self.K, dim) / 5
        else:
            assert s_init.shape == (self.K, dim), f"provided s_init have\
                incorrect shape: {s_init.shape=} instead of {(self.K, dim)}"

        self.S_leaves: S_handler = S_handler(s_init)
        self.weights = np.zeros((self.K, dim), dtype=float)

        self._trained = False

    def get_weights(self):
        """Return the trained (K x dim) weight matrix; fit() must run first."""
        if not self._trained:
            # kept as Warning for backward compatibility with callers that
            # catch it; a RuntimeError would be more conventional
            raise Warning("first call fit() method of trainer.")
        return self.weights

    def fit(self, n_steps=100):
        """
        Run ``n_steps`` rounds of coordinate-ascent style inference.

        :param n_steps: positive number of outer iterations
        """
        assert n_steps > 0 and isinstance(n_steps, int), "number of steps should be positive integer"

        # first setup the weights as in simple linear regression
        self._update_weights(
            tasks=self.tasks,
            weights_mp=self.weights,
            S_leaves=None,
            R=None,
            K=self.K,
            rho=self.rho,
            cov=np.eye(self.dim)
        )

        for _ in range(n_steps):
            # get the most probable parameters under the current posteriors
            R = self.R_distr.get_most_prob()
            L = self.L_distr.get_most_prob()

            # inference on coalescent tree;
            # integrates out the evolution of covariance
            leaves = [Item(elem, cov=0) for elem in self.S_leaves.s_list]
            coalescent_tree = CoalescentTree(leaves, L, self.dim)

            # gradient optimization of S_leaves
            # based on generated tree and parameters
            self.S_leaves.update_param(R, L, self.weights, coalescent_tree)

            # update covariance matrix posterior based on the new tree
            L_samples = optimal_cov(coalescent_tree, self.dim)
            self.L_distr.update_posterior(L_samples)

            # update correlation matrix posterior
            R_samples = optimal_R(self.S_leaves.s_list, self.weights,)
            self.R_distr.update_posterior(R_samples)

            # update weights based on new posterior
            self._update_weights(
                tasks=self.tasks,
                weights_mp=self.weights,
                S_leaves=self.S_leaves.s_list,
                R=R,
                K=self.K,
                rho=self.rho,
                cov=None
            )

        self._trained = True

    def _update_weights(self, tasks, weights_mp, S_leaves, R, K, rho, cov=None):
        """
        Write each task's MAP weights into ``weights_mp`` in place.

        When ``cov`` is given, it is used directly as the prior covariance;
        otherwise the per-task prior is diag(exp(S_i)) @ R @ diag(exp(S_i)).
        """
        for i in range(K):
            if cov is None:
                # BUG FIX: was np.exp(np.diag(S_leaves[i])) — elementwise
                # exp of a diagonal matrix puts exp(0)=1 on every
                # OFF-diagonal entry. The intended scaling matrix is
                # diag(exp(s)), consistent with S_handler's np.diag(s_exp).
                S_exp = np.diag(np.exp(S_leaves[i]))
                cov_i = S_exp @ R @ S_exp
            else:
                cov_i = cov

            w = tasks[i].most_prob_w(cov_i, rho)
            weights_mp[i] = w
__init__(tasks, dim, rho=0.05, cov_sigma=0.1, s_init=None)

:param tasks: list of TaskData, which will be learned :param dim: dimension of problems :param rho: parameter that scales noise level in labels :param cov_sigma: covariance matrix scaler for Coalescent evolution variation :param s_init: initial values for S_i: variance scales of data

Source code in bmm_multitask_learning/coalescent/coalescent_learner.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def __init__(self,
        tasks: list[TaskData],
        dim,
        rho=0.05,
        cov_sigma=0.1,
        s_init=None
    ):
    """

    :param tasks: list of TaskData, which will be learned
    :param dim: dimention of problems
    :param rho: parameter that scales noise level in labels
    :param cov_sigma: covariance matrix scaler for Coalescent evolution variation
    :param s_init: initial values for S_i: variance scales of data
    """

    self.tasks = tasks
    self.K = len(tasks)
    self.dim = dim
    self.rho = 0.05
    self.R_distr = InverseWishart(dim, dim+1, corr_mat=True)

    self.cov_sigma = cov_sigma
    self.L_distr = InverseWishart(dim, dim+1,  cov_sigma * np.eye(dim), corr_mat=False)

    if s_init is None:
        s_init = np.zeros((self.K, dim), dtype=float)
        for i in range(self.K):
            S = np.random.randn(dim)/5
            s_init[i] = S
    else:
        assert s_init.shape == (self.K, dim), f"provided s_init have\
            incorrect shape: {s_init.shape=} instead of {(self.K, dim)}"

    self.S_leaves: S_handler = S_handler(s_init)
    self.weights = np.zeros((self.K, dim), dtype=float)

    self._trained = False

S_handler

class to encapsulate the methods for updating covariance scalers S

Source code in bmm_multitask_learning/coalescent/coalescent_learner.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class S_handler:
    """
    class to encapsulate the methods for updating covariance scalers S
    """
    def __init__(self, s_list):
        self.s_list = s_list
        self.K = len(self.s_list)

    @staticmethod
    def _stable_exp_neg(s):
        """
        Numerically stable elementwise exp(-s), squeezed to 1-d.

        Shared by log_prob_S and the gradient below (previously duplicated).
        """
        s_neg = -s
        s_max = np.max(s_neg)
        # factor out the largest exponent to avoid overflow in np.exp
        s_exp = np.exp(s_neg - s_max) * np.exp(s_max)
        if len(s_exp.shape) > 1:
            s_exp = s_exp.squeeze()
        return s_exp

    def log_prob_S(self, s, R_inv, L_inv, P, W):
        """
        returns the log probability of proposed parameters for S

        :param s: vector of log scales
        :param R_inv: inverse correlation matrix
        :param L_inv: inverse evolution covariance
        :param P: diagonal matrix built from the parent's mean
        :param W: diagonal matrix built from the task weights
        """
        S_exp = np.diag(self._stable_exp_neg(s))
        S = np.diag(s)

        cov_m = S_exp @ R_inv @ S_exp
        return - np.trace(S) - 1/2 * np.trace((S - P) @ L_inv @ (S - P)) - 1/2 * np.trace(W @ cov_m @ W)

    def _optimize_s(self, R, L, w, p, s_0, verbose=False):
        """
        Gradient-based optimization of s for the given parameters.
        """
        L_inv = np.linalg.inv(L)
        R_inv = np.linalg.inv(R)
        W = np.diag(w)
        P = np.diag(p)
        d = R.shape[0]

        def grad_S(s):
            # gradient of log_prob_S w.r.t. the diagonal entries of S
            S_exp = np.diag(self._stable_exp_neg(s))
            S = np.diag(s)
            cov_m = S_exp @ R_inv @ S_exp
            return (- np.eye(d) - (S - P) @ L_inv + W @ cov_m @ W).diagonal()

        if verbose:
            print(f"[INFO] log prob start: {self.log_prob_S(s_0, R_inv, L_inv, P, W)}")

        optimizer = grad_optimizer(100, 0.001, grad_S)
        s_opt = optimizer.run(s_0)

        if verbose:
            # BUG FIX: previously called self.log_prob_S(s_opt) with a single
            # argument, raising TypeError whenever verbose=True
            print(f"[INFO] log prob trained: {self.log_prob_S(s_opt, R_inv, L_inv, P, W)}")

        return s_opt

    def update_param(self, R, L, weights,
                    coalescent_tree: "CoalescentTree",
                    verbose=False):
        """Re-optimize each task's s vector toward its parent in the tree."""
        for i in range(self.K):
            parent_s = coalescent_tree.leaves[i].parent.mean
            w = weights[i]
            s_0 = self.s_list[i]
            self.s_list[i] = self._optimize_s(R, L, w, parent_s, s_0, verbose)
log_prob_S(s, R_inv, L_inv, P, W)

returns the log probability of proposed parameters for S

Source code in bmm_multitask_learning/coalescent/coalescent_learner.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def log_prob_S(self, s, R_inv, L_inv, P, W):
    """
    returns the log probability of proposed parameters for S
    """
    s_neg = -s
    s_max = np.max(s_neg)
    s_neg = s_neg - s_max
    s_exp = np.exp(s_neg) * np.exp(s_max)
    if len(s_exp.shape) > 1:
        s_exp = s_exp.squeeze()
    S_exp = np.diag(s_exp)

    S = np.diag(s)

    cov_m = S_exp @ R_inv @ S_exp
    return - np.trace(S) - 1/2 * np.trace((S - P) @ L_inv @ (S - P)) - 1/2 * np.trace(W @ cov_m @ W)

inverse_wishart

InverseWishart

class to handle methods of InverseWishart distribution

Source code in bmm_multitask_learning/coalescent/inverse_wishart.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class InverseWishart:
    """
    class to handle methods of the InverseWishart distribution
    """
    def __init__(self, d, degrees_of_freedom, init_cov=None, corr_mat=False):
        """
        :param d: dimension of the covariance matrix
        :param degrees_of_freedom: degrees of freedom of the
            inverse Wishart distribution
        :param init_cov: initial covariance matrix (defaults to identity)
        :param corr_mat: if True, the covariance matrix is a correlation
            matrix (i.e., it has ones on the diagonal)
        """
        self.d = d
        if init_cov is None:
            init_cov = np.eye(d)
        else:
            assert init_cov.shape == (d, d)

        # store a float copy so update_posterior never mutates the
        # caller's array in place (and integer init_cov does not break
        # the accumulation below)
        self.cov = np.array(init_cov, dtype=float)
        self.degrees_of_freedom = degrees_of_freedom
        self.corr_mat = corr_mat

    @staticmethod
    def cov2corr(cov):
        """
        Convert a covariance matrix to a correlation matrix.
        """
        if len(cov.shape) == 0:
            # promote a scalar so np.diag below is well-defined
            cov = cov[None]

        std_devs = 1/np.sqrt(np.diag(cov))
        D = np.diag(std_devs)

        if len(cov.shape) == 1:
            # 1-d input: lift D and cov to 2-d for the matmul
            # (only sensible for the length-1 / scalar case)
            D = D[None]
            cov = cov[None]
        return D @ cov @ D

    def update_posterior(self, samples):
        """
        Accumulate the scatter of the samples into the posterior.

        :param samples: shape=(n x d)
        """
        assert len(samples.shape) == 2 and samples.shape[1] == self.d
        K = samples.shape[0]
        self.degrees_of_freedom += K

        S = samples.T @ samples  # d x d scatter matrix
        # out-of-place add: never mutate a possibly-aliased init_cov
        self.cov = self.cov + S
        return

    def get_most_prob(self):
        """
        returns the mode of the inverse Wishart distribution:
        Psi / (nu + d + 1), optionally converted to a correlation matrix
        """
        cov = self.cov / (self.degrees_of_freedom + self.d + 1)

        if self.corr_mat:
            cov = InverseWishart.cov2corr(cov)
        return cov
__init__(d, degrees_of_freedom, init_cov=None, corr_mat=False)

:param d: dimension of the covariance matrix :param degrees_of_freedom: degrees of freedom of the inverse Wishart distribution :param init_cov: initial covariance matrix :param corr_mat: if True, the covariance matrix is a correlation matrix (i.e., it has ones on the diagonal)

Source code in bmm_multitask_learning/coalescent/inverse_wishart.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def __init__(self, d, degrees_of_freedom, init_cov=None, corr_mat=False):
    """
    :param d: dimension of the covariance matrix
    :param degrees_of_freedom: degrees of freedom of the
        inverse Wishart distribution
    :param init_cov: initial covariance matrix
    :param corr_mat: if True, the covariance matrix is a correlation
        matrix (i.e., it has ones on the diagonal)
    """
    self.d = d
    if init_cov is None:
        init_cov = np.eye(d)
    else:
        assert init_cov.shape == (d, d)

    self.cov = init_cov
    self.degrees_of_freedom = degrees_of_freedom
    self.corr_mat = corr_mat
cov2corr(cov) staticmethod

Convert a covariance matrix to a correlation matrix

Source code in bmm_multitask_learning/coalescent/inverse_wishart.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
@staticmethod
def cov2corr(cov):
    """
    Conver covariance matrox to correlation matrix
    """
    if len(cov.shape) == 0:
        cov = cov[None]

    std_devs = 1/np.sqrt(np.diag(cov))
    D = np.diag(std_devs)

    d = len(cov.shape)

    if d == 1:
        D = D[None]
        cov = cov[None]
    R = D @ cov @ D
    return R
get_most_prob()

returns the mode of the inverse Wishart distribution

Source code in bmm_multitask_learning/coalescent/inverse_wishart.py
59
60
61
62
63
64
65
66
67
68
def get_most_prob(self):
    """
    returns the mode of the inverse Wishart distribution
    """
    cov = self.cov
    cov = cov/(self.degrees_of_freedom + self.d + 1)

    if self.corr_mat:
        cov = InverseWishart.cov2corr(cov)
    return cov
update_posterior(samples)

:param samples: shape=(n x d)

Source code in bmm_multitask_learning/coalescent/inverse_wishart.py
47
48
49
50
51
52
53
54
55
56
57
def update_posterior(self, samples):
    """
    :param samples: shape=(n x d)
    """
    assert len(samples.shape) == 2 and samples.shape[1] == self.d
    K = samples.shape[0]
    self.degrees_of_freedom += K

    S = samples.T @ samples  # d x d
    self.cov += S
    return

parameters_cov

optimal_cov(coalesce_tree, dim)

returns samples from the covariance matrix for the InverseWishart distribution parameter update

Source code in bmm_multitask_learning/coalescent/parameters_cov.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def optimal_cov(coalesce_tree: "CoalescentTree", dim):
    """
    returns samples from the covariance matrix
    for the InverseWishart distribution parameter update

    Each node with a parent contributes the normalized Brownian increment
    (mean - parent.mean) / sqrt(t - parent.t); the samples are then
    centered around their empirical mean.

    :param coalesce_tree: tree exposing ``iterate_over()`` over its nodes
    :param dim: dimensionality of the item means
    :return: array of shape (n_edges, dim) of centered increments
    """
    # single pass (previously two identical passes over the tree)
    increments = []
    for item in coalesce_tree.iterate_over():
        parent = item.parent
        if parent is not None:
            # coalescent time decreases toward the root, so dt > 0
            dt = item.t - parent.t
            increments.append((item.mean - parent.mean) / np.sqrt(dt))

    if not increments:
        # degenerate tree with no merges: nothing to contribute
        return np.zeros((0, dim))

    samples = np.asarray(increments, dtype=float)
    return samples - samples.mean(axis=0)

plot_coalescent

task_classes

TaskData dataclass

Source code in bmm_multitask_learning/coalescent/task_classes.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
@dataclass
class TaskData:
    """Design matrix and targets for a single learning task."""
    X: np.ndarray  # (n_task, d)
    y: np.ndarray  # (n_task, 1)

    def most_prob_w(self, sigma, rho):
        """
        :param sigma: cov matrix of prior of w
        :param rho: scaling coefficient for data
        """
        # data precision implied by the label noise level
        precision = (1/rho**2)

        # un-normalized posterior mean contribution from the data
        xy_sum = precision * (self.X * self.y).sum(0)

        # posterior covariance: prior precision plus data precision
        posterior_cov = np.linalg.inv(np.linalg.inv(sigma) +
                            precision * (self.X.T @ self.X))

        return posterior_cov @ (xy_sum)
most_prob_w(sigma, rho)

:param sigma: cov matrix of prior of w :param rho: scaling coefficient for data

Source code in bmm_multitask_learning/coalescent/task_classes.py
11
12
13
14
15
16
17
18
19
20
21
22
def most_prob_w(self, sigma, rho):
    """
    :param sigma: cov matrix of prior of w
    :param rho: scaling coefficient for data
    """
    coeff = (1/rho**2)
    mean_est_uncent = coeff * (self.X * self.y).sum(0)
    cov_fixed = np.linalg.inv(np.linalg.inv(sigma) +
                        coeff * (self.X.T @ self.X))

    mean_est = cov_fixed @ (mean_est_uncent)
    return mean_est

utils