A class to create an array based on trajectory features.
This array is later used to calculate similarities
between trajectories.
The class loads trajectory-specific features and organizes
them into arrays for further analysis. It assumes that all files
have the same shape.
The order in which they are loaded needs to be specified.
Initialize the FeatureSet.
Parameters:
-
filenames
(Union[List[str], ndarray], default:
None
)
–
A list or array of filenames.
-
filenameprefix
(Union[List[str], ndarray], default:
None
)
–
A list or array of names that contain the running number.
Must be used together with wildcard.
-
wildcard
(str, default:
'*{}*'
)
–
A wildcard string pattern for generating filenames
in combination with the running number from filenameprefix.
Must be provided if filenameprefix is used, by default '*{}*'.
-
verbose
(bool, default:
False
)
–
Whether to print logs during execution, by default False.
Examples:
>>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
>>> featureset = FeatureSet(filenames, verbose=True)
>>> featureset.fill_array()
>>> array = featureset.array
>>> 100%|████████████| X/X [XX:XX<00:00, X.XXit/s] # noqa
Source code in src/dcTMD/featureset.py
@beartype
def __init__(
    self,
    filenames: Union[List[str], np.ndarray, None] = None,
    filenameprefix: Union[List[str], np.ndarray, None] = None,
    wildcard: str = '*{}*',
    verbose: bool = False
) -> None:
    """
    Initialize the FeatureSet.

    Either `filenames` is given directly, or the filenames are generated
    from `filenameprefix` together with `wildcard`. If both are given,
    `filenames` takes precedence.

    Parameters
    ----------
    filenames :
        A list or array of filenames. May be None if `filenameprefix`
        is used instead.
    filenameprefix :
        A list or array of names that contain the running number.
        Must be used together with wildcard.
    wildcard :
        A wildcard string pattern for generating filenames
        in combination with the running number from filenameprefix.
        Must be non-empty if filenameprefix is used, by default '*{}*'.
    verbose : bool, optional
        Whether to print logs during execution, by default False.

    Raises
    ------
    ValueError
        If `filenames` is None and `filenameprefix` is None or
        `wildcard` is empty.

    Examples
    --------
    >>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
    >>> featureset = FeatureSet(filenames, verbose=True)
    >>> featureset.fill_array()
    >>> array = featureset.array
    """
    # None is part of both annotations on purpose: the defaults are None,
    # and the runtime type checker would otherwise reject an explicit
    # `filenames=None` / `filenameprefix=None` argument.
    self.verbose = verbose
    if filenames is not None:
        # Explicit filenames win over the prefix/wildcard mechanism.
        self.filenames = np.asarray(filenames)
    elif filenameprefix is not None and wildcard:
        self.filenames = self._get_filenames(filenameprefix, wildcard)
    else:
        raise ValueError(
            'Either `filenames` must be provided directly, '
            'or both `filenameprefix` and `wildcard` '
            'must be provided together.'
        )
    if self.verbose:
        print(f'Loaded filenames: {self.filenames}')
|
fill_array()
Load the data from trajectory files into a NumPy array.
This method reads each file in self.filenames and fills the data into a pre-allocated NumPy array based on the file shape determined by _read_testfile.
Files that cannot be loaded, or whose shape does not match the expected shape, are skipped.
Returns:
-
ndarray
–
An array where each entry corresponds to the data from a trajectory file.
Examples:
>>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
>>> featureset = FeatureSet(filenames, verbose=True)
>>> featureset.fill_array()
>>> array = featureset.array
>>> 100%|████████████| X/X [XX:XX<00:00, X.XXit/s] # noqa
Source code in src/dcTMD/featureset.py
def fill_array(self) -> np.ndarray:
    """
    Load the data from trajectory files into a NumPy array.

    This method reads each file in `self.filenames` and fills the data
    into a pre-allocated, zero-initialized NumPy array of shape
    ``(len(self.filenames), *self.fileshape)``. `_read_testfile` is
    called first; it is expected to provide `self.fileshape`.

    Files that cannot be loaded, or whose shape does not match the
    expected shape, are skipped. The row of a skipped file keeps its
    initial zeros, and the file is not added to `self.names_`, so
    `self.names_` lists only the files that were actually loaded.

    Returns
    -------
    np.ndarray
        An array where row ``i`` holds the data from
        ``self.filenames[i]``. The same array is stored in `self.array`.

    Examples
    --------
    >>> filenames = ['traj_01_contacts.txt', 'traj_02_contacts.txt']
    >>> featureset = FeatureSet(filenames, verbose=True)
    >>> featureset.fill_array()
    >>> array = featureset.array
    """
    self._read_testfile()
    array = np.zeros(shape=(
        len(self.filenames),
        *self.fileshape,
    ))
    self.names_ = []
    with tqdm(
        total=len(self.filenames),
        desc='Loading files',
    ) as pbar:
        for i, current_fname in enumerate(self.filenames):
            if self.verbose:
                print(f'Reading file {current_fname}')
            file_data = self._safe_loadtxt(current_fname)
            # Count every processed file, including skipped ones, so the
            # progress bar reaches its total even when files are skipped.
            pbar.update(1)
            # NOTE(review): presumably `_safe_loadtxt` returns None when
            # a file cannot be loaded -- confirm against its definition.
            # Without this guard, `.shape` below would raise instead of
            # skipping the file as documented.
            if file_data is None:
                print(
                    f'Skipping file {current_fname}: could not be loaded')
                continue
            if file_data.shape != self.fileshape:
                print(
                    f'Skipping file {current_fname} due to shape mismatch')
                continue
            array[i, :] = file_data
            self.names_.append(current_fname)
    self.array = array
    return self.array
|