density_clustering.cpp
1 /*
2 Copyright (c) 2015-2019, Florian Sittel (www.lettis.net) and Daniel Nagel
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #include "tools.hpp"
27 #include "logger.hpp"
28 #include "density_clustering.hpp"
29 
30 #ifdef USE_CUDA
31  #include "density_clustering_cuda.hpp"
32 #else
34 #endif
35 
36 #include <algorithm>
37 
38 namespace Clustering {
39  namespace Density {
40  BoxGrid
41  compute_box_grid(const float* coords,
42  const std::size_t n_rows,
43  const std::size_t n_cols,
44  const float radius) {
45  // use first and second coordinates, since these usually
46  // correspond to first and second PCs, having highest variance.
47  // if clustering is only in 1 dimension, boxes of higher dimensions
48  // will be kept empty.
49  const int BOX_DIM_1 = 0;
50  const int BOX_DIM_2 = 1;
51  BoxGrid grid;
52  ASSUME_ALIGNED(coords);
53  // find min/max values for first and second dimension
54  float min_x1=0.0f, max_x1=0.0f, min_x2=0.0f, max_x2=0.0f;
55  min_x1=coords[0*n_cols+BOX_DIM_1];
56  max_x1=coords[0*n_cols+BOX_DIM_1];
57  if (n_cols > 1) {
58  min_x2=coords[0*n_cols+BOX_DIM_2];
59  max_x2=coords[0*n_cols+BOX_DIM_2];
60  }
61  Clustering::logger(std::cout) << "setting up boxes for fast NN search" << std::endl;
62  for (std::size_t i=1; i < n_rows; ++i) {
63  min_x1 = std::min(min_x1, coords[i*n_cols+BOX_DIM_1]);
64  max_x1 = std::max(max_x1, coords[i*n_cols+BOX_DIM_1]);
65  if (n_cols > 1) {
66  min_x2 = std::min(min_x2, coords[i*n_cols+BOX_DIM_2]);
67  max_x2 = std::max(max_x2, coords[i*n_cols+BOX_DIM_2]);
68  }
69  }
70  // build 2D grid with boxes for efficient nearest neighbor search
71  grid.n_boxes.push_back((max_x1 - min_x1) / radius + 1);
72  if (n_cols > 1) {
73  grid.n_boxes.push_back((max_x2 - min_x2) / radius + 1);
74  } else {
75  grid.n_boxes.push_back(1);
76  }
77  grid.assigned_box.resize(n_rows);
78  for (std::size_t i=0; i < n_rows; ++i) {
79  int i_box_1=0, i_box_2=0;
80  i_box_1 = (coords[i*n_cols+BOX_DIM_1] - min_x1) / radius;
81  if (n_cols > 1) {
82  i_box_2 = (coords[i*n_cols+BOX_DIM_2] - min_x2) / radius;
83  }
84  grid.assigned_box[i] = {i_box_1, i_box_2};
85  grid.boxes[grid.assigned_box[i]].push_back(i);
86  }
87  return grid;
88  }
89 
90  constexpr Box
91  neighbor_box(const Box center, const int i_neighbor) {
92  return {center[0] + BOX_DIFF[i_neighbor][0]
93  , center[1] + BOX_DIFF[i_neighbor][1]};
94  }
95 
96  bool
97  is_valid_box(const Box box, const BoxGrid& grid) {
98  int i1 = box[0];
99  int i2 = box[1];
100  return ((i1 >= 0)
101  && (i1 < grid.n_boxes[0])
102  && (i2 >= 0)
103  && (i2 < grid.n_boxes[1]));
104  }
105 
106  std::vector<std::size_t>
107  calculate_populations(const float* coords,
108  const std::size_t n_rows,
109  const std::size_t n_cols,
110  const float radius) {
111  std::vector<float> radii = {radius};
112 #ifdef USE_CUDA
113  std::map<float, std::vector<std::size_t>> pop_map =
114  Clustering::Density::CUDA::calculate_populations(coords
115  , n_rows
116  , n_cols
117  , radii);
118 #else
119  std::map<float, std::vector<std::size_t>> pop_map =
120  calculate_populations(coords, n_rows, n_cols, radii);
121 #endif
122  return pop_map[radius];
123  }
124 
125  std::map<float, std::vector<std::size_t>>
126  calculate_populations(const float* coords,
127  const std::size_t n_rows,
128  const std::size_t n_cols,
129  std::vector<float> radii) {
130  std::map<float, std::vector<std::size_t>> pops;
131  for (float rad: radii) {
132  pops[rad].resize(n_rows, 1);
133  }
134  std::sort(radii.begin(), radii.end(), std::greater<float>());
135  std::size_t n_radii = radii.size();
136  std::vector<float> rad2(n_radii);
137  for (std::size_t i=0; i < n_radii; ++i) {
138  rad2[i] = radii[i]*radii[i];
139  }
140  ASSUME_ALIGNED(coords);
141  std::size_t i, j, k, l, ib;
142  BoxGrid grid = compute_box_grid(coords, n_rows, n_cols, radii[0]);
143 // Clustering::logger(std::cout) << " box grid: "
144 // << grid.n_boxes[0]
145 // << " x "
146 // << grid.n_boxes[1]
147 // << std::endl;
148 // Clustering::logger(std::cout) << "computing pops" << std::endl;
149  float dist, c;
150  Box box;
151  Box center;
152  int i_neighbor;
153  std::vector<int> box_buffer;
154  #pragma omp parallel for default(none)\
155  private(i,box,box_buffer,center,i_neighbor,ib,dist,j,k,l,c)\
156  firstprivate(n_rows,n_cols,n_radii,radii,rad2,N_NEIGHBOR_BOXES)\
157  shared(coords,pops,grid)\
158  schedule(dynamic,1024)
159  for (i=0; i < n_rows; ++i) {
160  center = grid.assigned_box[i];
161  // loop over surrounding boxes to find neighbor candidates
162  for (i_neighbor=0; i_neighbor < N_NEIGHBOR_BOXES; ++i_neighbor) {
163  box = neighbor_box(center, i_neighbor);
164  if (is_valid_box(box, grid)) {
165  box_buffer = grid.boxes[box];
166  // loop over frames inside surrounding box
167  for (ib=0; ib < box_buffer.size(); ++ib) {
168  j = box_buffer[ib];
169  if (i < j) {
170  dist = 0.0f;
171  #pragma simd reduction(+:dist)
172  for (k=0; k < n_cols; ++k) {
173  c = coords[i*n_cols+k] - coords[j*n_cols+k];
174  dist += c*c;
175  }
176  for (l=0; l < n_radii; ++l) {
177  if (dist < rad2[l]) {
178  #pragma omp atomic
179  pops[radii[l]][i] += 1;
180  #pragma omp atomic
181  pops[radii[l]][j] += 1;
182  } else {
183  // if it's not in the bigger radius,
184  // it won't be in the smaller ones.
185  break;
186  }
187  }
188  }
189  }
190  }
191  }
192  }
193  return pops;
194  }
195 
196  std::vector<float>
197  calculate_free_energies(const std::vector<std::size_t>& pops) {
198  std::size_t i;
199  const std::size_t n_frames = pops.size();
200  const float max_pop = (float) (*std::max_element(pops.begin()
201  , pops.end()));
202  std::vector<float> fe(n_frames);
203  #pragma omp parallel for default(none)\
204  private(i)\
205  firstprivate(max_pop, n_frames)\
206  shared(fe, pops)
207  for (i=0; i < n_frames; ++i) {
208  fe[i] = (float) -1.0f * log(pops[i]/max_pop);
209  }
210  return fe;
211  }
212 
213  std::vector<FreeEnergy>
214  sorted_free_energies(const std::vector<float>& fe) {
215  std::vector<FreeEnergy> fe_sorted;
216  for (std::size_t i=0; i < fe.size(); ++i) {
217  fe_sorted.push_back(FreeEnergy(i, fe[i]));
218  }
219  // sort for free energy: lowest to highest
220  // (low free energy = high density)
221  std::sort(fe_sorted.begin(),
222  fe_sorted.end(),
223  [] (const FreeEnergy& d1, const FreeEnergy& d2) -> bool {
224  return d1.second < d2.second;
225  });
226  return fe_sorted;
227  }
228 
229  std::tuple<Neighborhood, Neighborhood>
230  nearest_neighbors(const float* coords,
231  const std::size_t n_rows,
232  const std::size_t n_cols,
233  const std::vector<float>& free_energy) {
234 //TODO: there is a small error somewhere that misclassifies frames as
235 // nearest neighbors. compare to results with CUDA-driven code
236 // (whose output was manually checked for correctness)
237  Neighborhood nh;
238  Neighborhood nh_high_dens;
239  // initialize neighborhood
240  for (std::size_t i=0; i < n_rows; ++i) {
241  nh[i] = Neighbor(n_rows+1
242  , std::numeric_limits<float>::max());
243  nh_high_dens[i] = Neighbor(n_rows+1
244  , std::numeric_limits<float>::max());
245  }
246  // calculate nearest neighbors with distances
247  std::size_t i, j, c, min_j, min_j_high_dens;
248  float dist, d, mindist, mindist_high_dens;
249  ASSUME_ALIGNED(coords);
250  #pragma omp parallel for default(none) \
251  private(i,j,c,dist,d,mindist,mindist_high_dens,min_j,min_j_high_dens)\
252  firstprivate(n_rows,n_cols) \
253  shared(coords,nh,nh_high_dens,free_energy) \
254  schedule(dynamic, 2048)
255  for (i=0; i < n_rows; ++i) {
256  mindist = std::numeric_limits<float>::max();
257  mindist_high_dens = std::numeric_limits<float>::max();
258  min_j = n_rows+1;
259  min_j_high_dens = n_rows+1;
260  for (j=0; j < n_rows; ++j) {
261  if (i != j) {
262  dist = 0.0f;
263  #pragma simd reduction(+:dist)
264  for (c=0; c < n_cols; ++c) {
265  d = coords[i*n_cols+c] - coords[j*n_cols+c];
266  dist += d*d;
267  }
268  // direct neighbor
269  if (dist < mindist) {
270  mindist = dist;
271  min_j = j;
272  }
273  // next neighbor with higher density / lower free energy
274  if (free_energy[j] < free_energy[i]
275  && dist < mindist_high_dens) {
276  mindist_high_dens = dist;
277  min_j_high_dens = j;
278  }
279  }
280  }
281  nh[i] = Neighbor(min_j
282  , mindist);
283  nh_high_dens[i] = Neighbor(min_j_high_dens
284  , mindist_high_dens);
285  }
286  return std::make_tuple(nh, nh_high_dens);
287  }
288 
289  // returns neighborhood set of single frame.
290  // all ids are sorted in free energy.
291  std::set<std::size_t>
292  high_density_neighborhood(const float* coords,
293  const std::size_t n_cols,
294  const std::vector<FreeEnergy>& sorted_fe,
295  const std::size_t i_frame,
296  const std::size_t limit,
297  const float max_dist) {
298  // buffer to hold information whether frame i is
299  // in neighborhood (-> assign 1) or not (-> keep 0)
300  std::vector<int> frame_in_nh(limit, 0);
301  std::set<std::size_t> nh;
302  std::size_t j,c;
303  const std::size_t i_frame_sorted = sorted_fe[i_frame].first * n_cols;
304  float d,dist2;
305  ASSUME_ALIGNED(coords);
306  #pragma omp parallel for default(none)\
307  private(j,c,d,dist2)\
308  firstprivate(i_frame,i_frame_sorted,limit,max_dist,n_cols)\
309  shared(coords,sorted_fe,frame_in_nh)
310  for (j=0; j < limit; ++j) {
311  if (i_frame != j) {
312  dist2 = 0.0f;
313  #pragma simd reduction(+:dist2)
314  for (c=0; c < n_cols; ++c) {
315  d = coords[i_frame_sorted+c] - coords[sorted_fe[j].first*n_cols+c];
316  dist2 += d*d;
317  }
318  if (dist2 < max_dist) {
319  frame_in_nh[j] = 1;
320  }
321  }
322  }
323  // reduce buffer data to real neighborhood structure
324  for (j=0; j < limit; ++j) {
325  if (frame_in_nh[j] > 0) {
326  nh.insert(j);
327  }
328  }
329  nh.insert(i_frame);
330  return nh;
331  }
332 
333  double
335  double sigma2 = 0.0;
336  for (auto match: nh) {
337  // 'first second': nearest neighbor info
338  // 'second second': squared dist to nearest neighbor
339  sigma2 += match.second.second;
340  }
341  return (sigma2 / nh.size());
342  }
343 
344  std::vector<std::size_t>
345  assign_low_density_frames(const std::vector<std::size_t>& initial_clustering,
346  const Neighborhood& nh_high_dens,
347  const std::vector<float>& free_energy) {
348  std::vector<FreeEnergy> fe_sorted = sorted_free_energies(free_energy);
349  std::vector<std::size_t> clustering(initial_clustering);
350  for (const auto& fe: fe_sorted) {
351  std::size_t id = fe.first;
352  if (clustering[id] == 0) {
353  std::size_t neighbor_id = nh_high_dens.find(id)->second.first;
354  // assign cluster of nearest neighbor with higher density
355  clustering[id] = clustering[neighbor_id];
356  }
357  }
358  return clustering;
359  }
360 
361  void
362  screening_log(const double sigma2
363  , const std::size_t first_frame_above_threshold
364  , const std::vector<FreeEnergy>& fe_sorted) {
365  logger(std::cout) << "sigma2: "
366  << sigma2
367  << std::endl
368  << first_frame_above_threshold
369  << " frames with low free energy / high density"
370  << std::endl
371  << "first frame above threshold has free energy: "
372  << fe_sorted[first_frame_above_threshold].second
373  << std::endl
374  << "merging initial clusters"
375  << std::endl
376  << std::endl
377  ;
378  }
379 
380 
381  std::tuple<std::vector<std::size_t>
382  , std::size_t
383  , double
384  , std::vector<FreeEnergy>
385  , std::set<std::size_t>
386  , std::size_t>
387  prepare_initial_clustering(const std::vector<float>& free_energy
388  , const Neighborhood& nh
389  , const float free_energy_threshold
390  , const std::size_t n_rows
391  , const std::vector<std::size_t> initial_clusters) {
392  std::vector<std::size_t> clustering;
393  bool have_initial_clusters = (initial_clusters.size() == n_rows);
394  if (have_initial_clusters) {
395  clustering = initial_clusters;
396  } else {
397  clustering = std::vector<std::size_t>(n_rows);
398  }
399  // sort lowest to highest (low free energy = high density)
400  std::vector<FreeEnergy> fe_sorted = sorted_free_energies(free_energy);
401  // find last frame below free energy threshold
402  auto lb = std::upper_bound(fe_sorted.begin()
403  , fe_sorted.end()
404  , FreeEnergy(0, free_energy_threshold)
405  , [](const FreeEnergy& d1
406  , const FreeEnergy& d2) -> bool {
407  return d1.second < d2.second;
408  });
409  std::size_t first_frame_above_threshold = (lb - fe_sorted.begin());
410  // compute sigma as deviation of nearest-neighbor distances
411  // (beware: actually, sigma2 is E[x^2] > Var(x) = E[x^2] - E[x]^2,
412  // with x being the distances between nearest neighbors).
413  // then compute a neighborhood with distance 4*sigma2 only on high density frames.
414  double sigma2 = compute_sigma2(nh);
415  // initialize distinct name from initial clustering
416  std::size_t distinct_name = *std::max_element(clustering.begin(), clustering.end());
417  std::set<std::size_t> visited_frames = {};
418  if (have_initial_clusters) {
419  // initialize visited_frames from initial clustering
420  // (with indices in order of sorted free energies)
421  for (std::size_t i=0; i < first_frame_above_threshold; ++i) {
422  std::size_t i_original = fe_sorted[i].first;
423  if (initial_clusters[i_original] != 0) {
424  visited_frames.insert(i);
425  }
426  }
427  }
428  return std::make_tuple(clustering
429  , first_frame_above_threshold
430  , sigma2
431  , fe_sorted
432  , visited_frames
433  , distinct_name);
434  }
435 
436  std::vector<std::size_t>
437  normalized_cluster_names(std::size_t first_frame_above_threshold
438  , std::vector<std::size_t> clustering
439  , std::vector<FreeEnergy>& fe_sorted) {
440  std::set<std::size_t> final_names;
441  for (std::size_t i=0; i < first_frame_above_threshold; ++i) {
442  final_names.insert(clustering[fe_sorted[i].first]);
443  }
444  std::map<std::size_t, std::size_t> old_to_new;
445  old_to_new[0] = 0;
446  std::size_t new_name=0;
447  for (auto name: final_names) {
448  old_to_new[name] = ++new_name;
449  }
450  // rewrite clustered trajectory with new names
451  for(auto& elem: clustering) {
452  elem = old_to_new[elem];
453  }
454  return clustering;
455  }
456 
457  std::vector<std::size_t>
458  sorted_cluster_names(std::vector<std::size_t> clustering) {
459 
460  std::size_t n_frames = clustering.size();
461 
462  // generate counting maps
463  typedef std::map<std::size_t,std::size_t> CounterClustMap;
464  CounterClustMap counts;
465  for (std::size_t i=0; i < n_frames; ++i) {
466  CounterClustMap::iterator it(counts.find(clustering[i]));
467  if (it != counts.end()){
468  it->second++;
469  } else {
470  counts[clustering[i]] = 1;
471  }
472  }
473 
474  // convert to vector for sorting
475  std::vector<std::pair<std::size_t,std::size_t>> counts_vec;
476  std::copy(counts.begin(), counts.end(), std::back_inserter(counts_vec));
477 
478  std::sort(counts_vec.begin(), counts_vec.end(), compare2DVector);
479 
480  // generate cluster name conversion map
481  CounterClustMap MapNames;
482  for(std::size_t i = 0; i < counts_vec.size(); ++i) {
483  MapNames[counts_vec[i].first] = counts_vec.size() - i;
484  }
485 
486  // remap cluster names
487  std::vector<std::size_t> remapped_clustering(n_frames);
488  for (std::size_t i = 0; i < n_frames; ++i) {
489  remapped_clustering[i] = MapNames[clustering[i]];
490  }
491  return remapped_clustering;
492  }
493 
494  bool
495  compare2DVector(const std::pair<std::size_t,std::size_t> &p1, const std::pair<std::size_t,std::size_t> &p2) {
496  return p1.second < p2.second;
497  }
498 
499  bool
500  has2digits(float val){
501  float val_2digits = (int)(val * 100) / 100.0;
502  return val_2digits == val;
503  }
504 
505  bool
506  lump_initial_clusters(const std::set<std::size_t>& local_nh
507  , std::size_t& distinct_name
508  , std::vector<std::size_t>& clustering
509  , const std::vector<FreeEnergy>& fe_sorted
510  , std::size_t first_frame_above_threshold) {
511  bool neighboring_clusters_merged = true;
512  // ... let's see if at least some of them already have a
513  // designated cluster assignment
514  std::set<std::size_t> cluster_names;
515  for (auto j: local_nh) {
516  cluster_names.insert(clustering[fe_sorted[j].first]);
517  }
518  if ( ! (cluster_names.size() == 1
519  && cluster_names.count(0) != 1)) {
520  neighboring_clusters_merged = false;
521  // remove the 'zero' state, i.e. state of unassigned frames
522  if (cluster_names.count(0) == 1) {
523  cluster_names.erase(0);
524  }
525  std::size_t common_name;
526  if (cluster_names.size() > 0) {
527  // indeed, there are already cluster assignments.
528  // these should now be merged under a common name.
529  // (which will be the id with smallest numerical value,
530  // due to the properties of STL-sets).
531  common_name = (*cluster_names.begin());
532  } else {
533  // no clustering of these frames yet.
534  // choose a distinct name.
535  common_name = ++distinct_name;
536  }
537  for (auto j: local_nh) {
538  clustering[fe_sorted[j].first] = common_name;
539  }
540  std::size_t j,ndx;
541  #pragma omp parallel for default(none) private(j,ndx)\
542  firstprivate(common_name,\
543  first_frame_above_threshold,\
544  cluster_names)\
545  shared(clustering,fe_sorted)
546  for (j=0; j < first_frame_above_threshold; ++j) {
547  ndx = fe_sorted[j].first;
548  if (cluster_names.count(clustering[ndx]) == 1) {
549  clustering[ndx] = common_name;
550  }
551  }
552  }
553  return neighboring_clusters_merged;
554  }
555 
556 
557 #ifndef DC_USE_MPI
558  void
559  main(boost::program_options::variables_map args) {
560 
561 //TODO check if files have been read correctly
562 
563  using namespace Clustering::Tools;
564  const std::string input_file = args["file"].as<std::string>();
565  // setup coords
566  float* coords;
567  std::size_t n_rows;
568  std::size_t n_cols;
569 // Clustering::logger(std::cout) << "reading coords" << std::endl;
570  std::tie(coords, n_rows, n_cols) = read_coords<float>(input_file);
571  std::string header_comment = args["header"].as<std::string>();
572  std::map<std::string,float> commentsMap = args["commentsMap"].as<std::map<std::string,float>>();
574  std::vector<float> free_energies;
575 
576  if (args.count("input") && (args.count("free-energy") || args.count("nearest-neighbors"))) {
577  std::cerr << "error: for input (-i) -D/-B should be used." << std::endl;
578  exit(EXIT_FAILURE);
579  }
580 
581  Clustering::logger(std::cout) << "~~~ free energy and population" << std::endl;
582  if (args.count("free-energy-input")) {
583  Clustering::logger(std::cout) << " re-using free energy: "
584  << args["free-energy-input"].as<std::string>()
585  << std::endl;
586  if (args.count("radii") || args.count("radius")) {
587  Clustering::logger(std::cout) << "warning: radius (-r/-R) is ignored" << std::endl;
588  }
589  if (args.count("free-energy") || args.count("population")) {
590  Clustering::logger(std::cout) << "warning: -p/-d flags are ignored" << std::endl;
591  }
592  free_energies = read_free_energies(args["free-energy-input"].as<std::string>());
593  // read previously used parameters
594  read_comments(args["free-energy-input"].as<std::string>(), commentsMap);
595  } else if (args.count("free-energy") || args.count("population") || args.count("output")) {
596  if (args.count("radii")) {
597  Clustering::logger(std::cout) << " calculating free energy and population" << std::endl;
598  // compute populations & free energies for different radii in one go
599  if (args.count("output")) {
600  std::cerr << "error: clustering cannot be done with several radii (-R is set)." << std::endl;
601  exit(EXIT_FAILURE);
602  }
603  if ( ! (args.count("population") || args.count("free-energy"))) {
604  std::cerr << "error: no output defined for populations or free energies.\n"
605  << " why did you define -R ?" << std::endl;
606  exit(EXIT_FAILURE);
607  }
608  std::vector<float> radii = args["radii"].as<std::vector<float>>();
609  Clustering::logger(std::cout) << " using radii: ";
610  for(auto const& radius: radii) {
611  Clustering::logger(std::cout) << radius << ", ";
612  }
613  Clustering::logger(std::cout) << "\b\b " << std::endl;;
614 
615 #ifdef USE_CUDA
616  Clustering::logger(std::cout) << " using CUDA" << std::endl;
617  Pops pops = Clustering::Density::CUDA::calculate_populations(coords
618  , n_rows
619  , n_cols
620  , radii);
621 #else
622  Clustering::logger(std::cout) << " using CPU" << std::endl;
623  Pops pops = calculate_populations(coords
624  , n_rows
625  , n_cols
626  , radii);
627 #endif
628 
629  Clustering::logger(std::cout) << " storing results" << std::endl;
630  for (auto radius_pops: pops) {
631  if (args.count("population")) {
632  std::string basename_pop = args["population"].as<std::string>() + "_%f";
633  write_pops(Clustering::Tools::stringprintf(basename_pop, radius_pops.first),
634  radius_pops.second,
635  header_comment, commentsMap);
636  }
637  if (args.count("free-energy")) {
638  std::string basename_fe = args["free-energy"].as<std::string>() + "_%f";
639  write_fes(Clustering::Tools::stringprintf(basename_fe, radius_pops.first),
640  calculate_free_energies(radius_pops.second),
641  header_comment, commentsMap);
642  }
643  }
644  } else {
645  float radius_lump = 1.0;
646  // if no radius passed, use clustering radius = lumping radius.
647  // TODO: Optimize, only sigma is needed. No need of fe, nn_hd
648  if ( ! args.count("radius")) {
649  Clustering::logger(std::cout) << " computing lumping radius" << std::endl;
650 
651  std::vector<std::size_t> pops = calculate_populations(coords,
652  n_rows,
653  n_cols,
654  radius_lump);
655  free_energies = calculate_free_energies(pops);
656 
657  Neighborhood nh;
658 #ifdef USE_CUDA
659  auto nh_tuple = Clustering::Density::CUDA::nearest_neighbors(coords
660  , n_rows
661  , n_cols
662  , free_energies);
663 #else
664  auto nh_tuple = nearest_neighbors(coords, n_rows, n_cols, free_energies);
665 #endif
666  nh = std::get<0>(nh_tuple);
667 
668  double sigma2 = compute_sigma2(nh);
669  radius_lump = sqrt(4*sigma2);
670  Clustering::logger(std::cout) << " d_lump=" << radius_lump << std::endl;
671  commentsMap["lumping_radius"] = radius_lump;
672  }
673  Clustering::logger(std::cout) << " calculating free energy and population" << std::endl;
674  const float radius = (args.count("radius")) ? args["radius"].as<float>() : radius_lump;
675  Clustering::logger(std::cout) << " using radius: " << radius << std::endl;
676  commentsMap["clustering_radius"] = radius;
677  // compute populations & free energies for clustering and/or saving
678 // Clustering::logger(std::cout) << "calculating populations" << std::endl;
679  std::vector<std::size_t> pops = calculate_populations(coords, n_rows, n_cols, radius);
680  if (args.count("population")) {
681  Clustering::logger(std::cout) << " storing population in: "
682  << args["population"].as<std::string>() << std::endl;
683  write_pops(args["population"].as<std::string>(), pops, header_comment, commentsMap);
684  }
685 // Clustering::logger(std::cout) << "calculating free energies" << std::endl;
686  free_energies = calculate_free_energies(pops);
687  if (args.count("free-energy")) {
688  Clustering::logger(std::cout) << " storing free energy in: "
689  << args["free-energy"].as<std::string>() << std::endl;
690  write_fes(args["free-energy"].as<std::string>(), free_energies, header_comment, commentsMap);
691  }
692  }
693  }
695  Neighborhood nh;
696  Neighborhood nh_high_dens;
697  Clustering::logger(std::cout) << "\n~~~ nearest neighbors" << std::endl;
698  if (args.count("nearest-neighbors-input")) {
699  Clustering::logger(std::cout) << " re-using nearest neighbor: "
700  << args["nearest-neighbors-input"].as<std::string>()
701  << std::endl;
702  auto nh_pair = read_neighborhood(args["nearest-neighbors-input"].as<std::string>());
703  nh = nh_pair.first;
704  nh_high_dens = nh_pair.second;
705 
706  // check if nn is consistent with fe, otherwise warning will be printed
707  read_comments(args["nearest-neighbors-input"].as<std::string>(), commentsMap);
708  } else if (args.count("nearest-neighbors") || args.count("output")) {
709  if (args.count("radii")) {
710  std::cerr << "error: nearest neighbor calculation cannot be done with\n"
711  << " several radii (-R is set)." << std::endl;
712  exit(EXIT_FAILURE);
713  }
714  Clustering::logger(std::cout) << " calculating nearest neighbors" << std::endl;
715 #ifdef USE_CUDA
716  auto nh_tuple = Clustering::Density::CUDA::nearest_neighbors(coords
717  , n_rows
718  , n_cols
719  , free_energies);
720 #else
721  auto nh_tuple = nearest_neighbors(coords, n_rows, n_cols, free_energies);
722 #endif
723  nh = std::get<0>(nh_tuple);
724  nh_high_dens = std::get<1>(nh_tuple);
725 
726  if (commentsMap["lumping_radius"] == 0.) {
727  double sigma2 = compute_sigma2(nh);
728  float radius_lump = sqrt(4*sigma2);
729  Clustering::logger(std::cout) << " lumping radius: " << radius_lump << std::endl;
730  commentsMap["lumping_radius"] = radius_lump;
731  }
732  if (args.count("nearest-neighbors")) {
733  Clustering::logger(std::cout) << " storing nearest neighbors in: "
734  << args["nearest-neighbors"].as<std::string>() << std::endl;
735  Clustering::Tools::write_neighborhood(args["nearest-neighbors"].as<std::string>(), nh, nh_high_dens, header_comment, commentsMap);
736  }
737  }
739  if (args.count("output")) {
740  if (args.count("radii")) {
741  std::cerr << "error: output needs to depend on single radius\n"
742  << " several radii (-R is set)." << std::endl;
743  exit(EXIT_FAILURE);
744  }
745 #ifdef USE_CUDA
746  using Clustering::Density::CUDA::screening;
747 #else
749 #endif
750  const std::string output_file = args["output"].as<std::string>();
751  std::vector<std::size_t> clustering;
752  if (args.count("input")) {
753  Clustering::logger(std::cout) << "~~~ generating microstates" << std::endl;
754  if (args.count("threshold-screening")) {
755  Clustering::logger(std::cout) << "warning: screening (-T) is ignored" << std::endl;
756  }
757  Clustering::logger(std::cout) << " reading initial states: "
758  << args["input"].as<std::string>() << std::endl;
759  clustering = read_clustered_trajectory(args["input"].as<std::string>());
760 
761  // read and check consistency of comments
762  read_comments(args["input"].as<std::string>(), commentsMap);
763  Clustering::logger(std::cout) << " assigning low density states to initial states" << std::endl;
764  clustering = assign_low_density_frames(clustering
765  , nh_high_dens
766  , free_energies);
767  // sort, rename and save states
768  Clustering::logger(std::cout) << " sorting and renaming states by decreasing population" << std::endl;
769  clustering = sorted_cluster_names(clustering);
770  Clustering::logger(std::cout) << " storing states in: " << output_file << std::endl;
771  write_clustered_trajectory(output_file, clustering, header_comment, commentsMap);
772  } else if (args.count("threshold-screening")) {
773  Clustering::logger(std::cout) << "\n~~~ free energy screening" << std::endl;
774  std::vector<float> threshold_params = args["threshold-screening"].as<std::vector<float>>();
775  if (threshold_params.size() > 3) {
776  std::cerr << "error: option -T expects at most three floating point arguments: FROM STEP TO." << std::endl;
777  exit(EXIT_FAILURE);
778  }
779  float t_from = 0.1;
780  float t_step = 0.1;
781  float t_to = *std::max_element(free_energies.begin(), free_energies.end());
782  if (threshold_params.size() >= 1 && threshold_params[0] >= 0.0f) {
783  t_from = threshold_params[0];
784  }
785  if (threshold_params.size() >= 2) {
786  t_step = threshold_params[1];
787  }
788  if (threshold_params.size() == 3) {
789  t_to = threshold_params[2];
790  }
791  // check if input is correctly formatted
792  if (! (has2digits(t_from) && has2digits(t_step))){
793  std::cerr << "error: -T can handle at maximum two digits." << std::endl;
794  exit(EXIT_FAILURE);
795  }
796  // set comments
797  commentsMap["screening_to"] = t_to;
798  commentsMap["screening_from"] = t_from;
799  commentsMap["screening_step"] = t_step;
800  Clustering::logger(std::cout) << "\n fe frames" << std::endl;
801  // upper limit extended to a 10th of the stepsize to
802  // circumvent rounding errors when comparing on equality
803  float t_to_low = t_to - t_step/10.0f + t_step;
804  float t_to_high = t_to + t_step/10.0f + t_step;
805  for (float t=t_from; (t < t_to_low) && !(t_to_high < t); t += t_step) {
806  // compute clusters, re-using old results from previous step
807  clustering = screening(free_energies
808  , nh
809  , t
810  , coords
811  , n_rows
812  , n_cols
813  , clustering);
815  , clustering, header_comment, commentsMap);
816  }
817  } else {
818  std::cerr << "error: one of -T/-i is needed to generate output." << std::endl;
819  exit(EXIT_FAILURE);
820  }
821  }
822  Clustering::logger(std::cout) << "~~~ freeing memory" << std::endl;
823  free_coords(coords);
824  }
825 #endif
826  } // end namespace Density
827 } // end namespace Clustering
828 
void write_pops(std::string filename, std::vector< std::size_t > pops, std::string header_comment, std::map< std::string, float > stringMap)
write populations as column into given file
Definition: tools.cpp:50
std::vector< std::size_t > assign_low_density_frames(const std::vector< std::size_t > &initial_clustering, const Neighborhood &nh_high_dens, const std::vector< float > &free_energy)
additional tools used throughout the clustering package
Definition: tools.cpp:33
std::vector< std::size_t > normalized_cluster_names(std::size_t first_frame_above_threshold, std::vector< std::size_t > clustering, std::vector< FreeEnergy > &fe_sorted)
return clustered trajectory with new, distinct cluster names.
general namespace for clustering package
Definition: coring.cpp:38
Tools mainly for IO and some other functions.
double compute_sigma2(const Neighborhood &nh)
#define ASSUME_ALIGNED(c)
needed for aligned memory allocation for Xeon Phi, SSE or AVX
Definition: tools.hpp:47
const int N_NEIGHBOR_BOXES
number of neigbor boxes in 2D grid (including center box).
bool is_valid_box(const Box box, const BoxGrid &grid)
std::set< std::size_t > high_density_neighborhood(const float *coords, const std::size_t n_cols, const std::vector< FreeEnergy > &sorted_fe, const std::size_t i_frame, const std::size_t limit, const float max_dist)
std::vector< float > read_free_energies(std::string filename)
read free energies from plain text file
Definition: tools.cpp:96
bool lump_initial_clusters(const std::set< std::size_t > &local_nh, std::size_t &distinct_name, std::vector< std::size_t > &clustering, const std::vector< FreeEnergy > &fe_sorted, std::size_t first_frame_above_threshold)
lump clusters based on distance threshold in screening process
void main(boost::program_options::variables_map args)
user interface and controlling function for density-based geometric clustering.
Clustering::Tools::Neighborhood Neighborhood
map frame id to neighbors
Define global logger.
void free_coords(NUM *coords)
free memory pointing to coordinates
Definition: tools.hxx:116
std::vector< int > n_boxes
total number of boxes
std::map< Box, std::vector< int > > boxes
the boxes with a list of assigned frame ids
std::tuple< Neighborhood, Neighborhood > nearest_neighbors(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const std::vector< float > &free_energy)
constexpr int BOX_DIFF[9][2]
void write_neighborhood(const std::string filename, const Neighborhood &nh, const Neighborhood &nh_high_dens, std::string header_comment, std::map< std::string, float > stringMap)
Definition: tools.cpp:144
std::ostream & logger(std::ostream &s)
Definition: logger.cpp:32
void read_comments(std::string filename, std::map< std::string, float > &stringMap)
read comments of stringMap from file. Comments should start with #@
Definition: tools.cpp:229
std::pair< Neighborhood, Neighborhood > read_neighborhood(const std::string filename)
Definition: tools.cpp:101
std::tuple< std::vector< std::size_t >, std::size_t, double, std::vector< FreeEnergy >, std::set< std::size_t >, std::size_t > prepare_initial_clustering(const std::vector< float > &free_energy, const Neighborhood &nh, const float free_energy_threshold, const std::size_t n_rows, const std::vector< std::size_t > initial_clusters)
BoxGrid compute_box_grid(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const float radius)
std::vector< std::size_t > calculate_populations(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const float radius)
calculate population of n-dimensional hypersphere per frame for one fixed radius. ...
constexpr Box neighbor_box(const Box center, const int i_neighbor)
std::map< int, unsigned int > CounterClustMap
map for sorting clusters by population
Definition: noise.hpp:50
void write_fes(std::string filename, std::vector< float > fes, std::string header_comment, std::map< std::string, float > stringMap)
write free energies as column into given file
Definition: tools.cpp:42
void screening_log(const double sigma2, const std::size_t first_frame_above_threshold, const std::vector< FreeEnergy > &fe_sorted)
log output for screening steps
std::vector< float > calculate_free_energies(const std::vector< std::size_t > &pops)
std::vector< FreeEnergy > sorted_free_energies(const std::vector< float > &fe)
void write_clustered_trajectory(std::string filename, std::vector< std::size_t > traj, std::string header_comment, std::map< std::string, float > stringMap)
write state trajectory into plain text file
Definition: tools.cpp:63
bool has2digits(float val)
check if float has at maximum two digits
Density-based clustering.
std::pair< std::size_t, float > FreeEnergy
matches frame id to free energy
std::array< int, 2 > Box
encodes 2D box for box-assisted search algorithm
Clustering::Tools::Neighbor Neighbor
matches neighbor&#39;s frame id to distance
std::vector< Box > assigned_box
matching frame id to the frame&#39;s assigned box
std::vector< std::size_t > read_clustered_trajectory(std::string filename)
read states from trajectory (given as plain text file)
Definition: tools.cpp:58
std::string stringprintf(const std::string &str,...)
printf-version for std::string
Definition: tools.cpp:80
std::vector< std::size_t > screening(const std::vector< float > &free_energy, const Neighborhood &nh, const float free_energy_threshold, const float *coords, const std::size_t n_rows, const std::size_t n_cols, const std::vector< std::size_t > initial_clusters)
bool compare2DVector(const std::pair< std::size_t, std::size_t > &p1, const std::pair< std::size_t, std::size_t > &p2)
compare two two dimensional pairs by their second entry
std::vector< std::size_t > sorted_cluster_names(std::vector< std::size_t > clustering)
sorts the cluster by decreasing population and renames them from 1..N