31 #include "density_clustering_cuda.hpp" 42 const std::size_t n_rows,
43 const std::size_t n_cols,
49 const int BOX_DIM_1 = 0;
50 const int BOX_DIM_2 = 1;
54 float min_x1=0.0f, max_x1=0.0f, min_x2=0.0f, max_x2=0.0f;
55 min_x1=coords[0*n_cols+BOX_DIM_1];
56 max_x1=coords[0*n_cols+BOX_DIM_1];
58 min_x2=coords[0*n_cols+BOX_DIM_2];
59 max_x2=coords[0*n_cols+BOX_DIM_2];
62 for (std::size_t i=1; i < n_rows; ++i) {
63 min_x1 = std::min(min_x1, coords[i*n_cols+BOX_DIM_1]);
64 max_x1 = std::max(max_x1, coords[i*n_cols+BOX_DIM_1]);
66 min_x2 = std::min(min_x2, coords[i*n_cols+BOX_DIM_2]);
67 max_x2 = std::max(max_x2, coords[i*n_cols+BOX_DIM_2]);
71 grid.
n_boxes.push_back((max_x1 - min_x1) / radius + 1);
73 grid.
n_boxes.push_back((max_x2 - min_x2) / radius + 1);
78 for (std::size_t i=0; i < n_rows; ++i) {
79 int i_box_1=0, i_box_2=0;
80 i_box_1 = (coords[i*n_cols+BOX_DIM_1] - min_x1) / radius;
82 i_box_2 = (coords[i*n_cols+BOX_DIM_2] - min_x2) / radius;
92 return {center[0] +
BOX_DIFF[i_neighbor][0]
93 , center[1] + BOX_DIFF[i_neighbor][1]};
106 std::vector<std::size_t>
108 const std::size_t n_rows,
109 const std::size_t n_cols,
110 const float radius) {
111 std::vector<float> radii = {radius};
113 std::map<float, std::vector<std::size_t>> pop_map =
114 Clustering::Density::CUDA::calculate_populations(coords
119 std::map<float, std::vector<std::size_t>> pop_map =
122 return pop_map[radius];
125 std::map<float, std::vector<std::size_t>>
127 const std::size_t n_rows,
128 const std::size_t n_cols,
129 std::vector<float> radii) {
130 std::map<float, std::vector<std::size_t>> pops;
131 for (
float rad: radii) {
132 pops[rad].resize(n_rows, 1);
134 std::sort(radii.begin(), radii.end(), std::greater<float>());
135 std::size_t n_radii = radii.size();
136 std::vector<float> rad2(n_radii);
137 for (std::size_t i=0; i < n_radii; ++i) {
138 rad2[i] = radii[i]*radii[i];
141 std::size_t i, j, k, l, ib;
153 std::vector<int> box_buffer;
154 #pragma omp parallel for default(none)\ 155 private(i,box,box_buffer,center,i_neighbor,ib,dist,j,k,l,c)\ 156 firstprivate(n_rows,n_cols,n_radii,radii,rad2,N_NEIGHBOR_BOXES)\ 157 shared(coords,pops,grid)\ 158 schedule(dynamic,1024) 159 for (i=0; i < n_rows; ++i) {
165 box_buffer = grid.
boxes[box];
167 for (ib=0; ib < box_buffer.size(); ++ib) {
171 #pragma simd reduction(+:dist) 172 for (k=0; k < n_cols; ++k) {
173 c = coords[i*n_cols+k] - coords[j*n_cols+k];
176 for (l=0; l < n_radii; ++l) {
177 if (dist < rad2[l]) {
179 pops[radii[l]][i] += 1;
181 pops[radii[l]][j] += 1;
199 const std::size_t n_frames = pops.size();
200 const float max_pop = (float) (*std::max_element(pops.begin()
202 std::vector<float> fe(n_frames);
203 #pragma omp parallel for default(none)\ 205 firstprivate(max_pop, n_frames)\ 207 for (i=0; i < n_frames; ++i) {
208 fe[i] = (float) -1.0f * log(pops[i]/max_pop);
213 std::vector<FreeEnergy>
215 std::vector<FreeEnergy> fe_sorted;
216 for (std::size_t i=0; i < fe.size(); ++i) {
221 std::sort(fe_sorted.begin(),
224 return d1.second < d2.second;
229 std::tuple<Neighborhood, Neighborhood>
231 const std::size_t n_rows,
232 const std::size_t n_cols,
233 const std::vector<float>& free_energy) {
240 for (std::size_t i=0; i < n_rows; ++i) {
242 , std::numeric_limits<float>::max());
244 , std::numeric_limits<float>::max());
247 std::size_t i, j, c, min_j, min_j_high_dens;
248 float dist, d, mindist, mindist_high_dens;
250 #pragma omp parallel for default(none) \ 251 private(i,j,c,dist,d,mindist,mindist_high_dens,min_j,min_j_high_dens)\ 252 firstprivate(n_rows,n_cols) \ 253 shared(coords,nh,nh_high_dens,free_energy) \ 254 schedule(dynamic, 2048) 255 for (i=0; i < n_rows; ++i) {
256 mindist = std::numeric_limits<float>::max();
257 mindist_high_dens = std::numeric_limits<float>::max();
259 min_j_high_dens = n_rows+1;
260 for (j=0; j < n_rows; ++j) {
263 #pragma simd reduction(+:dist) 264 for (c=0; c < n_cols; ++c) {
265 d = coords[i*n_cols+c] - coords[j*n_cols+c];
269 if (dist < mindist) {
274 if (free_energy[j] < free_energy[i]
275 && dist < mindist_high_dens) {
276 mindist_high_dens = dist;
283 nh_high_dens[i] =
Neighbor(min_j_high_dens
284 , mindist_high_dens);
286 return std::make_tuple(nh, nh_high_dens);
291 std::set<std::size_t>
293 const std::size_t n_cols,
294 const std::vector<FreeEnergy>& sorted_fe,
295 const std::size_t i_frame,
296 const std::size_t limit,
297 const float max_dist) {
300 std::vector<int> frame_in_nh(limit, 0);
301 std::set<std::size_t> nh;
303 const std::size_t i_frame_sorted = sorted_fe[i_frame].first * n_cols;
306 #pragma omp parallel for default(none)\ 307 private(j,c,d,dist2)\ 308 firstprivate(i_frame,i_frame_sorted,limit,max_dist,n_cols)\ 309 shared(coords,sorted_fe,frame_in_nh) 310 for (j=0; j < limit; ++j) {
313 #pragma simd reduction(+:dist2) 314 for (c=0; c < n_cols; ++c) {
315 d = coords[i_frame_sorted+c] - coords[sorted_fe[j].first*n_cols+c];
318 if (dist2 < max_dist) {
324 for (j=0; j < limit; ++j) {
325 if (frame_in_nh[j] > 0) {
336 for (
auto match: nh) {
339 sigma2 += match.second.second;
341 return (sigma2 / nh.size());
344 std::vector<std::size_t>
347 const std::vector<float>& free_energy) {
349 std::vector<std::size_t> clustering(initial_clustering);
350 for (
const auto& fe: fe_sorted) {
351 std::size_t
id = fe.first;
352 if (clustering[
id] == 0) {
353 std::size_t neighbor_id = nh_high_dens.find(
id)->second.first;
355 clustering[id] = clustering[neighbor_id];
363 ,
const std::size_t first_frame_above_threshold
364 ,
const std::vector<FreeEnergy>& fe_sorted) {
365 logger(std::cout) <<
"sigma2: " 368 << first_frame_above_threshold
369 <<
" frames with low free energy / high density" 371 <<
"first frame above threshold has free energy: " 372 << fe_sorted[first_frame_above_threshold].second
374 <<
"merging initial clusters" 381 std::tuple<std::vector<std::size_t>
384 , std::vector<FreeEnergy>
385 , std::set<std::size_t>
389 ,
const float free_energy_threshold
390 ,
const std::size_t n_rows
391 ,
const std::vector<std::size_t> initial_clusters) {
392 std::vector<std::size_t> clustering;
393 bool have_initial_clusters = (initial_clusters.size() == n_rows);
394 if (have_initial_clusters) {
395 clustering = initial_clusters;
397 clustering = std::vector<std::size_t>(n_rows);
402 auto lb = std::upper_bound(fe_sorted.begin()
407 return d1.second < d2.second;
409 std::size_t first_frame_above_threshold = (lb - fe_sorted.begin());
416 std::size_t distinct_name = *std::max_element(clustering.begin(), clustering.end());
417 std::set<std::size_t> visited_frames = {};
418 if (have_initial_clusters) {
421 for (std::size_t i=0; i < first_frame_above_threshold; ++i) {
422 std::size_t i_original = fe_sorted[i].first;
423 if (initial_clusters[i_original] != 0) {
424 visited_frames.insert(i);
428 return std::make_tuple(clustering
429 , first_frame_above_threshold
436 std::vector<std::size_t>
438 , std::vector<std::size_t> clustering
439 , std::vector<FreeEnergy>& fe_sorted) {
440 std::set<std::size_t> final_names;
441 for (std::size_t i=0; i < first_frame_above_threshold; ++i) {
442 final_names.insert(clustering[fe_sorted[i].first]);
444 std::map<std::size_t, std::size_t> old_to_new;
446 std::size_t new_name=0;
447 for (
auto name: final_names) {
448 old_to_new[name] = ++new_name;
451 for(
auto& elem: clustering) {
452 elem = old_to_new[elem];
457 std::vector<std::size_t>
460 std::size_t n_frames = clustering.size();
464 CounterClustMap counts;
465 for (std::size_t i=0; i < n_frames; ++i) {
466 CounterClustMap::iterator it(counts.find(clustering[i]));
467 if (it != counts.end()){
470 counts[clustering[i]] = 1;
475 std::vector<std::pair<std::size_t,std::size_t>> counts_vec;
476 std::copy(counts.begin(), counts.end(), std::back_inserter(counts_vec));
481 CounterClustMap MapNames;
482 for(std::size_t i = 0; i < counts_vec.size(); ++i) {
483 MapNames[counts_vec[i].first] = counts_vec.size() - i;
487 std::vector<std::size_t> remapped_clustering(n_frames);
488 for (std::size_t i = 0; i < n_frames; ++i) {
489 remapped_clustering[i] = MapNames[clustering[i]];
491 return remapped_clustering;
495 compare2DVector(
const std::pair<std::size_t,std::size_t> &p1,
const std::pair<std::size_t,std::size_t> &p2) {
496 return p1.second < p2.second;
501 float val_2digits = (int)(val * 100) / 100.0;
502 return val_2digits == val;
507 , std::size_t& distinct_name
508 , std::vector<std::size_t>& clustering
509 ,
const std::vector<FreeEnergy>& fe_sorted
510 , std::size_t first_frame_above_threshold) {
511 bool neighboring_clusters_merged =
true;
514 std::set<std::size_t> cluster_names;
515 for (
auto j: local_nh) {
516 cluster_names.insert(clustering[fe_sorted[j].first]);
518 if ( ! (cluster_names.size() == 1
519 && cluster_names.count(0) != 1)) {
520 neighboring_clusters_merged =
false;
522 if (cluster_names.count(0) == 1) {
523 cluster_names.erase(0);
525 std::size_t common_name;
526 if (cluster_names.size() > 0) {
531 common_name = (*cluster_names.begin());
535 common_name = ++distinct_name;
537 for (
auto j: local_nh) {
538 clustering[fe_sorted[j].first] = common_name;
541 #pragma omp parallel for default(none) private(j,ndx)\ 542 firstprivate(common_name,\ 543 first_frame_above_threshold,\ 545 shared(clustering,fe_sorted) 546 for (j=0; j < first_frame_above_threshold; ++j) {
547 ndx = fe_sorted[j].first;
548 if (cluster_names.count(clustering[ndx]) == 1) {
549 clustering[ndx] = common_name;
553 return neighboring_clusters_merged;
559 main(boost::program_options::variables_map args) {
564 const std::string input_file = args[
"file"].as<std::string>();
570 std::tie(coords, n_rows, n_cols) = read_coords<float>(input_file);
571 std::string header_comment = args[
"header"].as<std::string>();
572 std::map<std::string,float> commentsMap = args[
"commentsMap"].as<std::map<std::string,float>>();
574 std::vector<float> free_energies;
576 if (args.count(
"input") && (args.count(
"free-energy") || args.count(
"nearest-neighbors"))) {
577 std::cerr <<
"error: for input (-i) -D/-B should be used." << std::endl;
582 if (args.count(
"free-energy-input")) {
584 << args[
"free-energy-input"].as<std::string>()
586 if (args.count(
"radii") || args.count(
"radius")) {
589 if (args.count(
"free-energy") || args.count(
"population")) {
594 read_comments(args[
"free-energy-input"].as<std::string>(), commentsMap);
595 }
else if (args.count(
"free-energy") || args.count(
"population") || args.count(
"output")) {
596 if (args.count(
"radii")) {
597 Clustering::logger(std::cout) <<
" calculating free energy and population" << std::endl;
599 if (args.count(
"output")) {
600 std::cerr <<
"error: clustering cannot be done with several radii (-R is set)." << std::endl;
603 if ( ! (args.count(
"population") || args.count(
"free-energy"))) {
604 std::cerr <<
"error: no output defined for populations or free energies.\n" 605 <<
" why did you define -R ?" << std::endl;
608 std::vector<float> radii = args[
"radii"].as<std::vector<float>>();
610 for(
auto const& radius: radii) {
617 Pops pops = Clustering::Density::CUDA::calculate_populations(coords
630 for (
auto radius_pops: pops) {
631 if (args.count(
"population")) {
632 std::string basename_pop = args[
"population"].as<std::string>() +
"_%f";
635 header_comment, commentsMap);
637 if (args.count(
"free-energy")) {
638 std::string basename_fe = args[
"free-energy"].as<std::string>() +
"_%f";
641 header_comment, commentsMap);
645 float radius_lump = 1.0;
648 if ( ! args.count(
"radius")) {
659 auto nh_tuple = Clustering::Density::CUDA::nearest_neighbors(coords
666 nh = std::get<0>(nh_tuple);
669 radius_lump = sqrt(4*sigma2);
671 commentsMap[
"lumping_radius"] = radius_lump;
673 Clustering::logger(std::cout) <<
" calculating free energy and population" << std::endl;
674 const float radius = (args.count(
"radius")) ? args[
"radius"].as<float>() : radius_lump;
676 commentsMap[
"clustering_radius"] = radius;
680 if (args.count(
"population")) {
682 << args[
"population"].as<std::string>() << std::endl;
683 write_pops(args[
"population"].as<std::string>(), pops, header_comment, commentsMap);
687 if (args.count(
"free-energy")) {
689 << args[
"free-energy"].as<std::string>() << std::endl;
690 write_fes(args[
"free-energy"].as<std::string>(), free_energies, header_comment, commentsMap);
698 if (args.count(
"nearest-neighbors-input")) {
700 << args[
"nearest-neighbors-input"].as<std::string>()
702 auto nh_pair =
read_neighborhood(args[
"nearest-neighbors-input"].as<std::string>());
704 nh_high_dens = nh_pair.second;
707 read_comments(args[
"nearest-neighbors-input"].as<std::string>(), commentsMap);
708 }
else if (args.count(
"nearest-neighbors") || args.count(
"output")) {
709 if (args.count(
"radii")) {
710 std::cerr <<
"error: nearest neighbor calculation cannot be done with\n" 711 <<
" several radii (-R is set)." << std::endl;
716 auto nh_tuple = Clustering::Density::CUDA::nearest_neighbors(coords
723 nh = std::get<0>(nh_tuple);
724 nh_high_dens = std::get<1>(nh_tuple);
726 if (commentsMap[
"lumping_radius"] == 0.) {
728 float radius_lump = sqrt(4*sigma2);
730 commentsMap[
"lumping_radius"] = radius_lump;
732 if (args.count(
"nearest-neighbors")) {
734 << args[
"nearest-neighbors"].as<std::string>() << std::endl;
739 if (args.count(
"output")) {
740 if (args.count(
"radii")) {
741 std::cerr <<
"error: output needs to depend on single radius\n" 742 <<
" several radii (-R is set)." << std::endl;
746 using Clustering::Density::CUDA::screening;
750 const std::string output_file = args[
"output"].as<std::string>();
751 std::vector<std::size_t> clustering;
752 if (args.count(
"input")) {
754 if (args.count(
"threshold-screening")) {
758 << args[
"input"].as<std::string>() << std::endl;
763 Clustering::logger(std::cout) <<
" assigning low density states to initial states" << std::endl;
768 Clustering::logger(std::cout) <<
" sorting and renaming states by decreasing population" << std::endl;
772 }
else if (args.count(
"threshold-screening")) {
774 std::vector<float> threshold_params = args[
"threshold-screening"].as<std::vector<float>>();
775 if (threshold_params.size() > 3) {
776 std::cerr <<
"error: option -T expects at most three floating point arguments: FROM STEP TO." << std::endl;
781 float t_to = *std::max_element(free_energies.begin(), free_energies.end());
782 if (threshold_params.size() >= 1 && threshold_params[0] >= 0.0f) {
783 t_from = threshold_params[0];
785 if (threshold_params.size() >= 2) {
786 t_step = threshold_params[1];
788 if (threshold_params.size() == 3) {
789 t_to = threshold_params[2];
793 std::cerr <<
"error: -T can handle at maximum two digits." << std::endl;
797 commentsMap[
"screening_to"] = t_to;
798 commentsMap[
"screening_from"] = t_from;
799 commentsMap[
"screening_step"] = t_step;
803 float t_to_low = t_to - t_step/10.0f + t_step;
804 float t_to_high = t_to + t_step/10.0f + t_step;
805 for (
float t=t_from; (t < t_to_low) && !(t_to_high < t); t += t_step) {
815 , clustering, header_comment, commentsMap);
818 std::cerr <<
"error: one of -T/-i is needed to generate output." << std::endl;
std::vector< std::size_t > assign_low_density_frames(const std::vector< std::size_t > &initial_clustering, const Neighborhood &nh_high_dens, const std::vector< float > &free_energy)
std::vector< std::size_t > normalized_cluster_names(std::size_t first_frame_above_threshold, std::vector< std::size_t > clustering, std::vector< FreeEnergy > &fe_sorted)
return clustered trajectory with new, distinct cluster names.
general namespace for clustering package
double compute_sigma2(const Neighborhood &nh)
const int N_NEIGHBOR_BOXES
number of neigbor boxes in 2D grid (including center box).
bool is_valid_box(const Box box, const BoxGrid &grid)
std::set< std::size_t > high_density_neighborhood(const float *coords, const std::size_t n_cols, const std::vector< FreeEnergy > &sorted_fe, const std::size_t i_frame, const std::size_t limit, const float max_dist)
bool lump_initial_clusters(const std::set< std::size_t > &local_nh, std::size_t &distinct_name, std::vector< std::size_t > &clustering, const std::vector< FreeEnergy > &fe_sorted, std::size_t first_frame_above_threshold)
lump clusters based on distance threshold in screening process
void main(boost::program_options::variables_map args)
user interface and controlling function for density-based geometric clustering.
Clustering::Tools::Neighborhood Neighborhood
map frame id to neighbors
std::vector< int > n_boxes
total number of boxes
std::map< Box, std::vector< int > > boxes
the boxes with a list of assigned frame ids
std::tuple< Neighborhood, Neighborhood > nearest_neighbors(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const std::vector< float > &free_energy)
constexpr int BOX_DIFF[9][2]
std::ostream & logger(std::ostream &s)
std::tuple< std::vector< std::size_t >, std::size_t, double, std::vector< FreeEnergy >, std::set< std::size_t >, std::size_t > prepare_initial_clustering(const std::vector< float > &free_energy, const Neighborhood &nh, const float free_energy_threshold, const std::size_t n_rows, const std::vector< std::size_t > initial_clusters)
BoxGrid compute_box_grid(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const float radius)
std::vector< std::size_t > calculate_populations(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const float radius)
calculate population of n-dimensional hypersphere per frame for one fixed radius. ...
constexpr Box neighbor_box(const Box center, const int i_neighbor)
std::map< int, unsigned int > CounterClustMap
map for sorting clusters by population
void screening_log(const double sigma2, const std::size_t first_frame_above_threshold, const std::vector< FreeEnergy > &fe_sorted)
log output for screening steps
std::vector< float > calculate_free_energies(const std::vector< std::size_t > &pops)
std::vector< FreeEnergy > sorted_free_energies(const std::vector< float > &fe)
bool has2digits(float val)
check if float has at maximum two digits
Density-based clustering.
std::pair< std::size_t, float > FreeEnergy
matches frame id to free energy
std::array< int, 2 > Box
encodes 2D box for box-assisted search algorithm
Clustering::Tools::Neighbor Neighbor
matches neighbor's frame id to distance
std::vector< Box > assigned_box
matching frame id to the frame's assigned box
std::vector< std::size_t > screening(const std::vector< float > &free_energy, const Neighborhood &nh, const float free_energy_threshold, const float *coords, const std::size_t n_rows, const std::size_t n_cols, const std::vector< std::size_t > initial_clusters)
bool compare2DVector(const std::pair< std::size_t, std::size_t > &p1, const std::pair< std::size_t, std::size_t > &p2)
compare two two dimensional pairs by their second entry
std::vector< std::size_t > sorted_cluster_names(std::vector< std::size_t > clustering)
sorts the cluster by decreasing population and renames them from 1..N