Skip to content

featureset

FeatureSet(filenames=None, filenameprefix=None, wildcard='*{}*', verbose=False)

A class to create an array based on trajectory features. This array is later used to calculate similarities between trajectories.

The class loads trajectory-specific features and organizes them into arrays for further analysis. It assumes that all files have the same shape. The order in which they are loaded needs to be specified.

Initialize the FeatureSet.

Parameters:

  • filenames (Union[List[str], ndarray], default: None ) –

    A list or array of filenames.

  • filenameprefix (Union[List[str], ndarray], default: None ) –

    A list or array of names that contain the running number. Must be used together with wildcard.

  • wildcard (str, default: '*{}*' ) –

    A wildcard string pattern for generating filenames in combination with the running number from filenameprefix. Must be provided if filenameprefix is used, by default '*{}*'.

  • verbose (bool, default: False ) –

    Whether to print logs during execution, by default False.

Examples:

>>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
>>> featureset = FeatureSet(filenames, verbose=True)
>>> featureset.fill_array()
>>> array = featureset.array
>>> 100%|████████████| X/X [XX:XX<00:00, X.XXit/s] # noqa
Source code in src/dcTMD/featureset.py
@beartype
def __init__(
    self,
    filenames: Union[List[str], np.ndarray, None] = None,
    filenameprefix: Union[List[str], np.ndarray, None] = None,
    wildcard: str = '*{}*',
    verbose: bool = False
) -> None:
    """
    Initialize the FeatureSet.

    The list of files is taken from `filenames` when it is given.
    Otherwise it is resolved by passing `filenameprefix` and `wildcard`
    to `_get_filenames`. If both `filenames` and `filenameprefix` are
    supplied, `filenames` takes precedence and `filenameprefix` is
    ignored.

    Parameters
    ----------
    filenames :
        A list or array of filenames. Stored as a NumPy array in
        `self.filenames`.
    filenameprefix :
        A list or array of names that contain the running number.
        Must be used together with wildcard.
    wildcard :
        A wildcard string pattern for generating filenames
        in combination with the running number from filenameprefix.
        Must be provided if filenameprefix is used, by default '*{}*'.
    verbose : bool, optional
        Whether to print logs during execution, by default False.

    Raises
    ------
    ValueError
        If `filenames` is None and either `filenameprefix` is None or
        `wildcard` is an empty string.

    Examples
    --------
    >>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
    >>> featureset = FeatureSet(filenames, verbose=True)
    >>> featureset.fill_array()
    >>> array = featureset.array
    >>> 100%|████████████| X/X [XX:XX<00:00, X.XXit/s] # noqa
    """
    # NOTE: `None` is listed explicitly in the hints above. Both parameters
    # default to None, and a runtime type checker rejects None for a hint
    # that does not include it.
    self.verbose = verbose

    if filenames is not None:
        self.filenames = np.asarray(filenames)
    elif filenameprefix is not None and wildcard:
        self.filenames = self._get_filenames(filenameprefix, wildcard)
    else:
        raise ValueError(
            'Either `filenames` must be provided directly, '
            'or both `filenameprefix` and `wildcard` '
            'must be provided together.'
        )
    if self.verbose:
        print(f'Loaded filenames: {self.filenames}')

fill_array()

Load the data from trajectory files into a NumPy array.

This method reads each file in self.filenames and fills the data into a pre-allocated NumPy array based on the file shape determined by _read_testfile.

Files that cannot be loaded, or whose shape does not match the expected shape, are skipped.

Returns:

  • ndarray

    An array where each entry corresponds to the data from a trajectory file.

Examples:

>>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
>>> featureset = FeatureSet(filenames, verbose=True)
>>> featureset.fill_array()
>>> array = featureset.array
>>> 100%|████████████| X/X [XX:XX<00:00, X.XXit/s] # noqa
Source code in src/dcTMD/featureset.py
def fill_array(self) -> np.ndarray:
    """
    Load the data from trajectory files into a NumPy array.

    This method calls `_read_testfile`, then pre-allocates a zero-filled
    array of shape ``(len(self.filenames), *self.fileshape)`` and fills
    row ``i`` with the data loaded from ``self.filenames[i]``.

    Files that cannot be loaded, or whose shape does not match
    `self.fileshape`, are skipped. The row of a skipped file stays
    zero-filled, and its name is not added to `self.names_`. As a
    result, `self.names_` lists only the files that were actually
    loaded and may be shorter than the first axis of the array.

    The result is stored in `self.array` and also returned.

    Returns
    -------
    np.ndarray
        An array with one row per entry in `self.filenames`; rows of
        skipped files contain zeros.

    Examples
    --------
    >>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
    >>> featureset = FeatureSet(filenames, verbose=True)
    >>> featureset.fill_array()
    >>> array = featureset.array
    >>> 100%|████████████| X/X [XX:XX<00:00, X.XXit/s] # noqa
    """
    self._read_testfile()
    n_files = len(self.filenames)
    array = np.zeros(shape=(
        n_files,
        *self.fileshape,
    ))
    self.names_ = []

    with tqdm(
        total=n_files,
        desc='Loading files',
    ) as pbar:
        for i, current_fname in enumerate(self.filenames):
            if self.verbose:
                print(f'Reading file {current_fname}')

            file_data = self._safe_loadtxt(current_fname)

            if file_data is None:
                # NOTE(review): `_safe_loadtxt` is not visible here. This
                # guard assumes it may return None for an unreadable file;
                # without it, the `.shape` access below would raise
                # AttributeError instead of skipping. Confirm against the
                # helper.
                print(
                    f'Skipping file {current_fname}: could not be loaded')
            elif file_data.shape != self.fileshape:
                print(
                    f'Skipping file {current_fname} due to shape mismatch')
            else:
                array[i, :] = file_data
                self.names_.append(current_fname)

            # Advance once per processed file, skipped or not, so the bar
            # reaches its total even when files are skipped.
            pbar.update(1)

    self.array = array
    return self.array