Source code for fame3r.score

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils._set_output import _SetOutputMixin
from sklearn.utils.validation import check_is_fitted, validate_data

__all__ = ["FAME3RScoreEstimator"]



[docs]
class FAME3RScoreEstimator(BaseEstimator, _SetOutputMixin):
    """Computes the FAME score for a set of features.

    The FAME score is defined as the mean Tanimoto similarity of the feature
    vector to the ``n_neighbors`` closest vectors in the training set.

    It is intended for this estimator to only be used with binary feature
    ("fingerprint") vectors, as Tanimoto similarity is not well-behaved on
    arbitrary vectors.

    Parameters
    ----------
    n_neighbors : int, default=3
        Number of nearest neighbors to consider during FAME score
        calculation. Defaults to 3, as defined in the original paper.

    Examples
    --------
    >>> from fame3r import FAME3RVectorizer, FAME3RScoreEstimator
    >>> from sklearn.pipeline import make_pipeline
    >>> pipeline = make_pipeline(
    ...    FAME3RVectorizer(output=["fingerprint"]),
    ...    FAME3RScoreEstimator()
    ... ).fit([["CC[C:1]"], ["CC[N:1]"], ["CC[O:1]"]])
    >>> pipeline.predict([["[C:1]CC"]])
    array([0.66666667])
    """

    def __init__(self, n_neighbors: int = 3):
        self.n_neighbors: int = n_neighbors

    def fit(self, X, y=None):
        """Fit the estimator to the training set of known samples.

        The validated training data is stored as-is and used as the
        reference set for the similarity search in :meth:`predict`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data. Must contain at least ``n_neighbors`` samples.
            Sparse input is not enabled in the validation call.

        y : (ignored)
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            The fitted FAME3RScoreEstimator instance.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is smaller than 1, or if ``X`` fails
            validation (e.g. it holds fewer than ``n_neighbors`` samples).
        """
        # A non-positive value would turn the ``[-n_neighbors:]`` slice used
        # for the neighbor selection into something else entirely (``[-0:]``
        # selects every row), silently producing a wrong score.
        if self.n_neighbors < 1:
            raise ValueError(
                f"n_neighbors must be a positive integer, got {self.n_neighbors!r}."
            )

        # ``validate_data`` already reports errors on behalf of ``self``; no
        # explicit ``estimator`` override is needed.
        X = validate_data(
            self,
            X,
            dtype="numeric",
            ensure_2d=True,
            ensure_min_samples=self.n_neighbors,
        )

        self._reference_data = X

        return self

    def predict(self, X):
        """Compute the FAME score of the given samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Query data. May be empty (zero samples).

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted FAME scores.
        """
        check_is_fitted(self)
        X = validate_data(
            self,
            X,
            reset=False,
            dtype="numeric",
            ensure_2d=True,
            ensure_min_samples=0,
        )

        # The queries are processed in 100 batches, so each similarity
        # matrix only spans a slice of the queries (presumably to limit peak
        # memory). ``np.concatenate`` is used instead of ``np.concat``, as
        # the latter only exists in NumPy >= 2.0.
        return np.concatenate(
            [
                _fame_score(self._reference_data, X_batch, n_neighbors=self.n_neighbors)
                for X_batch in np.array_split(X, 100)
            ]
        )

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names; always the single name
            ``"FAME3RScore"``.
        """
        return np.asarray(["FAME3RScore"], dtype=object)




def _fame_score(reference, X, *, n_neighbors):
    """Return the FAME score of every sample in ``X``.

    The score of a sample is the mean of its ``n_neighbors`` largest
    Tanimoto similarities to the rows of ``reference``.

    Parameters
    ----------
    reference : array-like of shape (n_reference, n_features)
        Samples the queries are compared against.
    X : array-like of shape (n_samples, n_features)
        Query samples.
    n_neighbors : int
        How many of the highest similarities are averaged per query. Note
        that ``0`` makes the ``[-0:]`` slice below select every row.

    Returns
    -------
    ndarray of shape (n_samples,)
        One score per query sample.
    """
    # Rows correspond to reference samples, columns to query samples.
    similarities = _tanimoto_similarity_matrix(reference, X)

    # Ascending sort within each column puts the best matches in the last rows.
    ranked = np.sort(similarities, axis=0)
    best_matches = ranked[-n_neighbors:]

    return np.mean(best_matches, axis=0)


def _tanimoto_similarity_matrix(A, B):
    """Compute the pairwise Tanimoto similarity between rows of two matrices.

    For rows ``a`` of ``A`` and ``b`` of ``B`` the similarity is
    ``a.b / (|a|^2 + |b|^2 - a.b)``. Pairs whose denominator is zero (both
    rows all-zero) get a similarity of 0.

    Parameters
    ----------
    A : array-like of shape (n_a, n_features)
        First set of vectors.
    B : array-like of shape (n_b, n_features)
        Second set of vectors.

    Returns
    -------
    ndarray of shape (n_a, n_b) and dtype float64
        ``result[i, j]`` is the similarity between ``A[i]`` and ``B[j]``.
    """
    # Compute in float64 regardless of the input dtype: a matmul of boolean
    # arrays is a logical operation (results capped at True), and small
    # integer types such as uint8 wrap around once the dot product exceeds
    # their range. Both would silently corrupt the similarities.
    A = np.asarray(A, dtype=np.float64)
    B = np.asarray(B, dtype=np.float64)

    intersection = np.matmul(A, B.T)
    A_square_norm = np.sum(A**2, axis=1)
    B_square_norm = np.sum(B**2, axis=1)
    union = A_square_norm[:, None] + B_square_norm[None, :] - intersection

    # ``where`` skips the zero denominators; those entries keep the 0.0 that
    # the output buffer was initialised with.
    return np.divide(
        intersection,
        union,
        out=np.zeros_like(intersection, dtype=np.float64),
        where=union != 0,
    )