network_builder.cpp
1 /*
2 Copyright (c) 2015-2019, Florian Sittel (www.lettis.net) and Daniel Nagel
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #include "network_builder.hpp"
27 
28 #include "tools.hpp"
29 #include "logger.hpp"
30 #include "embedded_cytoscape.hpp"
31 
32 #include <fstream>
33 #include <set>
34 #include <unordered_set>
35 #include <limits>
36 
37 #include <boost/program_options.hpp>
38 #include <boost/filesystem.hpp>
39 //#include <boost/progress.hpp>
40 #include <omp.h>
41 
42 
43 namespace {
47  int HORIZONTAL_SPACING = 10;
48  int VERTICAL_SPACING = 50;
49 // int NODE_SIZE_MIN = 5;
50 // int NODE_SIZE_MAX = 50;
51  // the following values will be set by
52  // 'save_network_to_html'
53  // and are stored here for convenience.
54  // needed by 'Node'-instances for
55  // construction of graphical network,
56  // i.e. to determine node sizes and colors.
57  std::size_t POP_MIN = 0;
58  std::size_t POP_MAX = 0;
59  float FE_MIN = 0.0f;
60  float FE_MAX = 0.0f;
62 
63  struct Node {
64  std::size_t id;
65  float fe;
66  std::size_t pop;
67  std::map<std::size_t, Node> children;
68  int pos_x = 0;
69  int pos_y = 0;
70  int _subtree_width = 0;
71 
72  Node();
73  Node(std::size_t _id, float _fe, std::size_t _pop);
74  Node* find_parent_of(std::size_t search_id);
75  void set_pos(int x, int y);
76  int subtree_width();
77  void print_subtree(std::ostream& os);
78  void print_node_and_subtree(std::ostream& os);
79  };
80 
81  constexpr bool fuzzy_equal(float a, float b, float prec) {
82  return (a <= b + prec) && (a >= b - prec);
83  }
84 
85  // overload output operator for Node-serialization
86  // (producing string representation of node + edges to children)
87  std::ostream& operator<<(std::ostream& os, const Node& n) {
88  float log_pop;
89  if (n.pop <= 0) {
90  log_pop = log(1);
91  } else {
92  log_pop = log(n.pop);
93  }
94  // print node itself
95  os << Clustering::Tools::stringprintf("{group:'nodes',id:'n%d',position:{x:%d,y:%d},data:{id:'n%d',pop:%d,fe:%f,info:'%d: fe=%0.2f, pop=%d',logpop:%0.2f}},\n",
96  n.id, n.pos_x, n.pos_y, n.id, n.pop, n.fe, n.id, n.fe, n.pop, log_pop);
97  // print edges from node's children to node
98  for (auto& id_child: n.children) {
99  std::size_t cid = id_child.first;
100  os << Clustering::Tools::stringprintf("{group:'edges',data:{id:'e%d_%d',source:'n%d',target:'n%d'}},\n", cid, n.id, cid, n.id);
101  }
102  return os;
103  }
104 
105  Node::Node() {}
106 
107  Node::Node(std::size_t _id,
108  float _fe,
109  std::size_t _pop)
110  : id(_id)
111  , fe(_fe)
112  , pop(_pop) {
113  }
114 
115  Node* Node::find_parent_of(std::size_t search_id) {
116  if (this->children.count(search_id)) {
117  return this;
118  } else {
119  for (auto& id_child: children) {
120  Node* cc = id_child.second.find_parent_of(search_id);
121  if (cc) {
122  return cc;
123  }
124  }
125  }
126  return NULL;
127  }
128 
129  void Node::set_pos(int x, int y) {
130  this->pos_x = x;
131  this->pos_y = y;
132  std::vector<int> width_children;
133  int total_width = 0;
134  for (auto& id_child: children) {
135  width_children.push_back(id_child.second.subtree_width());
136  total_width += id_child.second.subtree_width();
137  }
138  // set pos for every child recursively,
139  // regarding proper segmentation of horizontal space.
140  int cur_x = (x-0.5*total_width);
141  for (auto& id_child: children) {
142  int stw = id_child.second.subtree_width();
143  id_child.second.set_pos(cur_x + 0.5*stw, y + VERTICAL_SPACING);
144  cur_x += stw;
145  }
146  }
147 
148  int Node::subtree_width() {
149  // check for backtracked value
150  // and compute if not existing.
151  if ( ! this->_subtree_width) {
152  int self_width = 10 + 2*HORIZONTAL_SPACING; //TODO get from size
153  if (children.empty()) {
154  this->_subtree_width = self_width;
155  } else {
156  int sum = 0;
157  for (auto& id_child: children) {
158  sum += id_child.second.subtree_width();
159  }
160  if (sum > self_width) {
161  this->_subtree_width = sum;
162  } else {
163  this->_subtree_width = self_width;
164  }
165  }
166  }
167  return this->_subtree_width;
168  }
169 
170  void Node::print_subtree(std::ostream& os) {
171  for (auto& id_child: children) {
172  id_child.second.print_node_and_subtree(os);
173  }
174  }
175 
176  void Node::print_node_and_subtree(std::ostream& os) {
177  os << (*this) << std::endl;
178  this->print_subtree(os);
179  }
180 
181 
182  void
183  save_network_links(std::string fname, std::map<std::size_t, std::size_t> network,
184  std::string header_comment, std::map<std::string,float> commentsMap) {
185  fname.append("_links.dat");
186  Clustering::logger(std::cout) << " saving links in: " << fname << std::endl;
187  Clustering::Tools::append_commentsMap(header_comment, commentsMap);
188  header_comment.append("#\n# Name of the cluster connected to the name in next "
189  "higher free energy level\n# Named by the remapped clusters.\n#\n"
190  "# cluster_name(fe+step) cluster_name(fe)\n");
191  Clustering::Tools::write_map<std::size_t, std::size_t>(fname, network, header_comment, true);
192  }
193 
194  void
195  save_node_info(std::string fname,
196  std::map<std::size_t, float> free_energies,
197  std::map<std::size_t, std::size_t> pops,
198  std::string header_comment,
199  std::map<std::string,float> commentsMap) {
200  fname.append("_nodes.dat");
201  Clustering::logger(std::cout) << " saving nodes in: " << fname << std::endl;
202  Clustering::Tools::append_commentsMap(header_comment, commentsMap);
203  header_comment.append("#\n# nodes\n");
204  header_comment.append("#\n# Name of all clusters at a given free energies (fe) "
205  "with the corresponding populations pop.\n"
206  "# id(cluster) fe pop\n");
207  std::ofstream ofs(fname);
208  if (ofs.fail()) {
209  std::cerr << "error: cannot open file '" << fname << "' for writing." << std::endl;
210  exit(EXIT_FAILURE);
211  } else {
212  ofs << header_comment;
213  for (auto node_pop: pops) {
214  std::size_t key = node_pop.first;
215  ofs << key << " " << free_energies[key] << " " << node_pop.second << "\n";
216  }
217  }
218  }
219 
220  std::set<std::size_t>
221  compute_and_save_leaves(std::string fname,
222  std::map<std::size_t, std::size_t> network,
223  std::string header_comment,
224  std::map<std::string,float> commentsMap) {
225  fname.append("_leaves.dat");
226  Clustering::logger(std::cout) << " saving leaves in: " << fname << std::endl;
227  std::set<std::size_t> leaves;
228  std::set<std::size_t> not_leaves;
229  for (auto from_to: network) {
230  std::size_t src = from_to.first;
231  std::size_t target = from_to.second;
232  not_leaves.insert(target);
233  if (not_leaves.count(src)) {
234  leaves.erase(src);
235  } else {
236  leaves.insert(src);
237  }
238  }
239  std::vector<std::size_t> leaves_vec( leaves.begin(), leaves.end() );
240  Clustering::Tools::append_commentsMap(header_comment, commentsMap);
241  header_comment.append("#\n# All network leaves, i.e. nodes (microstates) without child\n"
242  "# nodes at a lower free energy level. These microstates represent\n"
243  "# the minima of their local basins.\n#\n"
244  "# id(cluster)\n"
245  );
246  Clustering::Tools::write_single_column<std::size_t>(fname, leaves_vec, header_comment, false);
247  return leaves;
248  }
249 
250  void
251  save_traj_of_leaves(std::string fname,
252  std::set<std::size_t> leaves,
253  float d_min,
254  float d_max,
255  float d_step,
256  std::string remapped_name,
257  std::size_t n_rows,
258  std::string header_comment,
259  std::map<std::string,float> commentsMap) {
260  fname.append("_end_node_traj.dat");
261  Clustering::logger(std::cout) << " saving end-node trajectory in: " << fname << std::endl;
262  std::vector<std::size_t> traj(n_rows);
263  const float prec = d_step / 10.0f;
264  for (float d=d_min; ! fuzzy_equal(d, d_max+d_step, prec); d += d_step) {
265  std::vector<std::size_t> cl_now = Clustering::Tools::read_clustered_trajectory(
266  Clustering::Tools::stringprintf(remapped_name, d));
267  for (std::size_t i=0; i < n_rows; ++i) {
268  if (leaves.count(cl_now[i])) {
269  traj[i] = cl_now[i];
270  }
271  }
272  }
273  Clustering::Tools::append_commentsMap(header_comment, commentsMap);
274  header_comment.append("#\n# All frames beloning to a leaf node are marked with\n"
275  "# the custer id. All others with zero.\n");
276  header_comment.append("#\n# state/cluster id frames are assigned to\n");
277  Clustering::Tools::write_single_column<std::size_t>(fname, traj, header_comment);
278  }
279 
280  void
281  save_network_to_html(std::string fname,
282  std::map<std::size_t, std::size_t> network,
283  std::map<std::size_t, float> free_energies,
284  std::map<std::size_t, std::size_t> pops) {
285  Clustering::logger(std::cout) << "\n~~~ computing network visualization" << std::endl;
286  // set (global) values for min/max of free energies and populations
287  FE_MAX = std::max_element(free_energies.begin(),
288  free_energies.end(),
289  [](std::pair<std::size_t, float> fe1,
290  std::pair<std::size_t, float> fe2) -> bool {
291  return fe1.second < fe2.second;
292  })->second;
293  FE_MIN = std::min_element(free_energies.begin(),
294  free_energies.end(),
295  [](std::pair<std::size_t, float> fe1,
296  std::pair<std::size_t, float> fe2) -> bool {
297  return fe1.second < fe2.second;
298  })->second;
299  POP_MAX = std::max_element(pops.begin(),
300  pops.end(),
301  [](std::pair<std::size_t, std::size_t> p1,
302  std::pair<std::size_t, std::size_t> p2) -> bool {
303  return p1.second < p2.second;
304  })->second;
305  POP_MIN = std::min_element(pops.begin(),
306  pops.end(),
307  [](std::pair<std::size_t, std::size_t> p1,
308  std::pair<std::size_t, std::size_t> p2) -> bool {
309  return p1.second < p2.second;
310  })->second;
311  // build trees from given network with respective 'root' on top and at highest FE.
312  // may be multiple trees because there may be multiple nodes that have max FE.
313  Node fake_root;
314  //std::size_t network_size = network.size();
315  //boost::progress_display show_progress(network_size);
316  for (auto from_to: network) {
317  //++show_progress;
318 
319  std::size_t i_from = from_to.first;
320  std::size_t i_to = from_to.second;
321 
322  Node* parent_to = fake_root.find_parent_of(i_to);
323  if ( ! parent_to) {
324  fake_root.children[i_to] = {i_to, free_energies[i_to], pops[i_to]};
325  parent_to = &fake_root;
326  }
327  Node* parent_from = fake_root.find_parent_of(i_from);
328  if (parent_from) {
329  // move existing node to its right place in the tree
330  parent_to->children[i_to].children[i_from] = parent_from->children[i_from];
331  parent_from->children.erase(i_from);
332  } else {
333  // create child at proper place in tree
334  parent_to->children[i_to].children[i_from] = {i_from, free_energies[i_from], pops[i_from]};
335  }
336  }
337  Clustering::logger(std::cout) << " ...done" << std::endl;
338  // write header
339  fname.append("_visualization.html");
340  std::ofstream ofs(fname);
341  if (ofs.fail()) {
342  std::cerr << "error: cannot open file '" << fname << "' for writing." << std::endl;
343  exit(EXIT_FAILURE);
344  } else {
345  float LOG_POP_MIN, LOG_POP_MAX;
346  if (POP_MIN <= 0) {
347  LOG_POP_MIN = 0.0f;
348  } else {
349  LOG_POP_MIN = log(POP_MIN);
350  }
351  if (POP_MAX <= 0) {
352  LOG_POP_MAX = 0.0f;
353  } else {
354  LOG_POP_MAX = log(POP_MAX);
355  }
356  ofs << Clustering::Network::viewer_header
357 
358  << "style: cytoscape.stylesheet().selector('node').css({"
359  << Clustering::Tools::stringprintf("'width': 'mapData(logpop, %0.2f, %0.2f, 5, 30)',", LOG_POP_MIN, LOG_POP_MAX)
360  << Clustering::Tools::stringprintf("'height': 'mapData(logpop, %0.2f, %0.2f, 5, 30)',", LOG_POP_MIN, LOG_POP_MAX)
361  << Clustering::Tools::stringprintf("'background-color': 'mapData(fe, %f, %f, blue, red)'})", FE_MIN, FE_MAX)
362 
363  << ".selector('edge').css({'opacity': '1.0', 'width': '5', 'target-arrow-shape': 'triangle'})"
364  << ".selector(':selected').css({'content': 'data(info)', 'font-size': 24, 'color': '#00ff00'})"
365 
366  << ", elements: [\n";
367  fake_root.set_pos(0, 0);
368  fake_root.print_subtree(ofs);
369  ofs << "]";
370  ofs << Clustering::Network::viewer_footer;
371  }
372  }
373 } // end local namespace
374 
375 
376 namespace Clustering {
377 namespace NetworkBuilder {
378 
379  void
380  main(boost::program_options::variables_map args) {
381  namespace b_fs = boost::filesystem;
382  using namespace Clustering::Tools;
383  // setup two threads parallel read/write
384  omp_set_num_threads(2);
385  // setup general flags / options
386  Clustering::verbose = args["verbose"].as<bool>();
387 
388  float d_min = args["min"].as<float>();
389  float d_max = args["max"].as<float>();
390  float d_step = args["step"].as<float>();
391  std::string basename = args["basename"].as<std::string>();
392  std::string basename_output = args["output"].as<std::string>();
393  basename.append(".%0.2f");
394  std::string remapped_name = "remapped_" + basename;
395  std::size_t minpop = args["minpop"].as<std::size_t>();
396  bool network_html = args["network-html"].as<bool>();
397  std::string header_comment = args["header"].as<std::string>();
398  std::map<std::string,float> commentsMap = args["commentsMap"].as<std::map<std::string,float>>();
399 
400  std::map<std::size_t, std::size_t> network;
401  std::map<std::size_t, std::size_t> pops;
402  std::map<std::size_t, float> free_energies;
403 
404 
405  std::string fname_next = stringprintf(basename, d_min);
406  if ( ! b_fs::exists(fname_next)) {
407  std::cerr << "error: file does not exist: " << fname_next
408  << " check basename (-b) and --min/--max/--step" << std::endl;
409  exit(EXIT_SUCCESS);
410  }
411  read_comments(fname_next, commentsMap);
412  std::vector<std::size_t> cl_next = read_clustered_trajectory(fname_next);
413  std::vector<std::size_t> cl_now;
414  std::size_t max_id;
415  std::size_t n_rows = cl_next.size();
416  // re-map states to give every state a unique id.
417  // this is necessary, since every initially clustered trajectory
418  // at different thresholds uses the same ids starting with 0.
419  const float prec = d_step / 10.0f;
420  if (d_max == 0.0f) {
421  // default: collect all until MAX_FE
422  d_max = std::numeric_limits<float>::max();
423  } else {
424  d_max += d_step;
425  }
426  float d;
427  Clustering::logger(std::cout) << "~~~ remapping cluster files and generating network" << std::endl;
428  for (d=d_min; ! fuzzy_equal(d, d_max, prec) && b_fs::exists(fname_next); d += d_step) {
429  Clustering::logger(std::cout) << " " << fname_next << " -> "
430  << stringprintf(remapped_name, d)<< std::endl;
431  cl_now = cl_next;
432  fname_next = stringprintf(basename, d + d_step);
433  #pragma omp parallel sections
434  {
435  #pragma omp section
436  {
437  write_clustered_trajectory(stringprintf(remapped_name, d),
438  cl_now,
439  header_comment,
440  commentsMap);
441  }
442  #pragma omp section
443  {
444  if (b_fs::exists(fname_next)) {
445  cl_next = read_clustered_trajectory(fname_next);
446  max_id = *std::max_element(cl_now.begin(), cl_now.end());
447  for (std::size_t i=0; i < n_rows; ++i) {
448  if (cl_next[i] != 0) {
449  cl_next[i] += max_id;
450  if (cl_now[i] != 0) {
451  network[cl_now[i]] = cl_next[i];
452  ++pops[cl_now[i]];
453  free_energies[cl_now[i]] = d;
454  }
455  }
456  }
457  }
458  }
459  }
460  }
461  // set correct value for d_max for later reference
462  d_max = d-d_step;
463  // if minpop given: delete nodes and edges not fulfilling min. population criterium
464  commentsMap["minimal_population"] = minpop;
465  if (minpop > 1) {
466  Clustering::logger(std::cout) << "\n~~~ removing states with population p < "
467  << minpop << std::endl;
468  std::unordered_set<std::size_t> removals;
469  auto pop_it = pops.begin();
470  Clustering::logger(std::cout) << " ... removing nodes" << std::endl;
471  while (pop_it != pops.end()) {
472  if (pop_it->second < minpop) {
473  removals.insert(pop_it->first);
474  pops.erase(pop_it++); // as above
475  } else {
476  ++pop_it;
477  }
478  }
479  Clustering::logger(std::cout) << " ... removing edges" << std::endl;
480  auto net_it = network.begin();
481  while (net_it != network.end()) {
482  std::size_t a = net_it->first;
483  std::size_t b = net_it->second;
484  if (removals.count(a) > 0 || removals.count(b) > 0) {
485  network.erase(net_it++);
486  } else {
487  ++net_it;
488  }
489  }
490  }
491  Clustering::logger(std::cout) << "\n~~~ storing output files" << std::endl;
492  // save links (i.e. edges) as two-column ascii in a child -> parent manner
493  save_network_links(basename_output, network, header_comment, commentsMap);
494  // save id, population and free energy of nodes
495  save_node_info(basename_output, free_energies, pops, header_comment, commentsMap);
496  // compute and directly save network end-nodes (i.e. leaves of the network-tree)
497  std::set<std::size_t> leaves = compute_and_save_leaves(basename_output,
498  network, header_comment, commentsMap);
499  // save the trajectory consisting of the 'leaf-states'.
500  // all non-leaf states are kept as non-assignment state '0'.
501  save_traj_of_leaves(basename_output, leaves,
502  d_min, d_max, d_step, remapped_name, n_rows, header_comment, commentsMap);
503  // generate html-file with embedded javascript to visualize network
504  if (network_html) {
505  save_network_to_html(basename_output, network, free_energies, pops);
506  }
507  }
508 } // end namespace NetworkBuilder
509 } // end namespace Clustering
510 
additional tools used throughout the clustering package
Definition: tools.cpp:33
bool verbose
global flag: use verbose output?
Definition: logger.cpp:29
general namespace for clustering package
Definition: coring.cpp:38
Tools mainly for IO and some other functions.
int main(int argc, char *argv[])
Parses option and execute corresponding sub-module.
Definition: clustering.cpp:66
Define global logger.
std::ostream & logger(std::ostream &s)
Definition: logger.cpp:32
void read_comments(std::string filename, std::map< std::string, float > &stringMap)
read comments of stringMap from file. Comments should start with #@
Definition: tools.cpp:229
void append_commentsMap(std::string &header_comment, std::map< std::string, float > &stringMap)
append commentsMap to header comment
Definition: tools.cpp:267
void write_clustered_trajectory(std::string filename, std::vector< std::size_t > traj, std::string header_comment, std::map< std::string, float > stringMap)
write state trajectory into plain text file
Definition: tools.cpp:63
Network Builder.
std::vector< std::size_t > read_clustered_trajectory(std::string filename)
read states from trajectory (given as plain text file)
Definition: tools.cpp:58
std::string stringprintf(const std::string &str,...)
printf-version for std::string
Definition: tools.cpp:80