noise.cpp
1 /*
2 Copyright (c) 2015-2019, Florian Sittel (www.lettis.net) and Daniel Nagel
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #include "noise.hpp"
27 
28 #include "logger.hpp"
29 #include "tools.hpp"
30 
31 #include <iostream>
32 #include <fstream>
33 #include <algorithm>
34 #include <vector>
35 
36 #include <boost/filesystem.hpp>
37 #include <omp.h>
38 
39 namespace Clustering {
40 namespace Noise {
41  void
42  main(boost::program_options::variables_map args) {
43  using namespace Clustering::Tools;
44  namespace b_fs = boost::filesystem;
45  namespace b_po = boost::program_options;
46  // load states
47  Clustering::logger(std::cout) << "~~~ reading files\n"
48  << " trajectory from: " << args["states"].as<std::string>()
49  << std::endl;
50  std::vector<std::size_t> states = Clustering::Tools::read_clustered_trajectory(args["states"].as<std::string>());
51  std::vector<std::size_t> states_without_noise;
52  std::copy(states.begin(), states.end(), std::back_inserter(states_without_noise));
53  std::set<std::size_t> state_names(states.begin(), states.end());
54  std::size_t n_frames = states.size();
55  // shift cmin fomr [0,100] -> [0,1]
56  float cmin = 0.01*args["cmin"].as<float>();
57  std::string basename = args["basename"].as<std::string>();
58  basename.append(".");
59  Clustering::verbose = args["verbose"].as<bool>();
60  // generate header comment
61  std::string header_comment = args["header"].as<std::string>();
62  std::map<std::string,float> commentsMap = args["commentsMap"].as<std::map<std::string,float>>();
63  // read previously used parameters
64  read_comments(args["states"].as<std::string>(), commentsMap);
65  commentsMap["cmin"] = cmin;
66  // noise state is 1 lower than lowest
67  auto lowestState = std::min_element(states.begin(), states.end());
68  std::size_t noiseState = *lowestState-1;
69 
70  if (args.count("output") || args.count("cores")) {
71  // load concatenation limits to treat concatenated trajectories correctly
72  // when performing dynamical corrections
73  std::vector<std::size_t> concat_limits;
74  if (args.count("concat-limits")) {
75  Clustering::logger(std::cout) << " limits from: "
76  << args["concat-limits"].as<std::string>() << std::endl;
77  concat_limits = Clustering::Tools::read_concat_limits(args["concat-limits"].as<std::string>());
78  } else if (args.count("concat-nframes")) {
79  std::size_t n_frames_per_subtraj = args["concat-nframes"].as<std::size_t>();
80  for (std::size_t i=n_frames_per_subtraj; i <= n_frames; i += n_frames_per_subtraj) {
81  concat_limits.push_back(i);
82  }
83  } else {
84  concat_limits = {n_frames};
85  }
86  // check if concat_limits are well definied
87  Clustering::Tools::check_concat_limits(concat_limits, n_frames);
88  // findest highest clust file
89  b_fs::path cwd(b_fs::current_path());
90 
91  typedef std::vector<b_fs::path> vec; // store paths,
92  vec v; // so we can sort them later
93 
94  std::copy(b_fs::directory_iterator(cwd), b_fs::directory_iterator(), std::back_inserter(v));
95 
96  std::sort(v.begin(), v.end()); // sort, since directory iteration
97  // is not ordered on some file systems
98 // std::cout << " basename: " << basename << "\n";
99 // std::cout << " v.end(): " << v.back().string() << "\n";
100  // iterate reverse
101  std::string clust_filename;
102  std::size_t found;
103  for (vec::reverse_iterator it(v.rbegin()), it_end(v.rend()); it != it_end; ++it)
104  {
105  std::ostringstream oss;
106  oss << *it;
107  std::string file_str = oss.str();
108 
109  found = file_str.rfind(basename);
110  if (found!=std::string::npos) {
111  clust_filename = file_str.substr(found,file_str.length()-found-1);
112  break;
113  }
114  }
115  // catch if file not found
116  if (found == std::string::npos) {
117  std::cerr << "\n" << "error (noise): cluster file of type " << basename << " not found\n\n";
118  exit(EXIT_FAILURE);
119  }
120 
121  // open highest clust file
122  header_comment.append(Clustering::Tools::stringprintf("#\n# Execution remarks:\n"
123  "# used for highest cluster file: %s\n", clust_filename.c_str()));
124  Clustering::logger(std::cout) << " highest cluster: " << clust_filename << std::endl;
125  std::vector<std::size_t> clust = Clustering::Tools::read_clustered_trajectory(clust_filename);
126  //TODO: check if parameters are identical to states ones
127  if (n_frames != clust.size()) {
128  std::cerr << "\n" << "error (noise): clust file is not of same length as state trajectory." << "\n\n";
129  exit(EXIT_FAILURE);
130  }
131  // generate counts of each cluster id
132  CounterClustMap counts;
133  for (std::size_t i=0; i < n_frames; ++i) {
134  CounterClustMap::iterator it(counts.find(clust[i]));
135  if (it != counts.end()){
136  it->second++;
137  } else {
138  counts[clust[i]] = 1;
139  }
140  }
141  Clustering::logger(std::cout) << "~~~ noise assignment" << std::endl;
142  std::size_t noise_frames = 0;
143  // define noise frames as state 0
144  for (std::size_t i=0; i < n_frames; ++i) {
145  if (counts[clust[i]] < cmin*n_frames){
146  states[i] = noiseState;
147  ++noise_frames;
148  }
149  }
150  float noise_frames_per = 100.*noise_frames / n_frames;
151  Clustering::logger(std::cout) << Clustering::Tools::stringprintf(" %.2f", noise_frames_per)
152  << "% of frames were identified as noise" << std::endl;
153  header_comment.append(Clustering::Tools::stringprintf("# %.2f", noise_frames_per) +
154  "% of frames were identified as noise\n");
155  // TODO: remove following line. Should we keep with argument?
156  // Clustering::Tools::write_clustered_trajectory("microstatesNoiseDef", states);
157  // noise core trajectory
158  std::vector<std::size_t> noise_traj(n_frames);
159  std::size_t current_core = states[0];
160  std::vector<long> cores(n_frames);
161  std::size_t changed_frames = 0;
162  // honour concatenation limits, i.e. treat every concatenated trajectory-part on its own
163  std::size_t last_limit = 0;
164  for (std::size_t next_limit: concat_limits) {
165  std::size_t next_limit_corrected = std::min(next_limit, n_frames);
166  for (std::size_t i=last_limit; i < next_limit_corrected; ++i) {
167  if (states[i] != noiseState) {
168  current_core = states[i];
169  break;
170  }
171  }
172  for (std::size_t i=last_limit; i < next_limit_corrected; ++i) {
173  // coring window
174  if (states[i] != noiseState) {
175  current_core = states[i];
176  cores[i] = current_core;
177  } else {
178  cores[i] = -1;
179  }
180  if (current_core != states_without_noise[i]) {
181  ++changed_frames;
182  }
183  noise_traj[i] = current_core;
184  }
185  last_limit = next_limit_corrected;
186  }
187  float changed_frames_per = 100.*changed_frames / n_frames;
188  Clustering::logger(std::cout) << Clustering::Tools::stringprintf(" %.2f", changed_frames_per)
189  << "% of frames were reassigned\n"
190  << " store result in: " << args["output"].as<std::string>()
191  << std::endl;
192  header_comment.append(Clustering::Tools::stringprintf("# %.2f", changed_frames_per) +
193  "% of frames were reassigned\n");
194  // write cored trajectory to file
195  if (args.count("output")) {
196  Clustering::Tools::write_clustered_trajectory(args["output"].as<std::string>(),
197  noise_traj,
198  header_comment,
199  commentsMap);
200  }
201  // write core information to file
202  if (args.count("cores")) {
203  append_commentsMap(header_comment, commentsMap);
204  Clustering::Tools::write_single_column<long>(args["cores"].as<std::string>(),
205  cores,
206  header_comment,
207  false);
208  }
209  } else {
210  std::cerr << "\n" << "error (noise): nothing to do! please define '--output' or '--cores'" << "\n\n";
211  exit(EXIT_FAILURE);
212  }
213  }
214 } // end namespace Noise
215 } // end namespace Clustering
216 
std::vector< std::size_t > read_concat_limits(std::string filename)
Definition: tools.cpp:133
additional tools used throughout the clustering package
Definition: tools.cpp:33
bool verbose
global flag: use verbose output?
Definition: logger.cpp:29
general namespace for clustering package
Definition: coring.cpp:38
Tools mainly for IO and some other functions.
Define global logger.
void check_concat_limits(std::vector< std::size_t > concat_limits, std::size_t n_frames)
check if concat limits were passed correctly
Definition: tools.cpp:189
std::ostream & logger(std::ostream &s)
Definition: logger.cpp:32
void read_comments(std::string filename, std::map< std::string, float > &stringMap)
read comments of stringMap from file. Comments should start with #@
Definition: tools.cpp:229
std::map< int, unsigned int > CounterClustMap
map for sorting clusters by population
Definition: noise.hpp:50
void append_commentsMap(std::string &header_comment, std::map< std::string, float > &stringMap)
append commentsMap to header comment
Definition: tools.cpp:267
void main(boost::program_options::variables_map args)
controlling function and user interface for noise assignment
Definition: noise.cpp:42
void write_clustered_trajectory(std::string filename, std::vector< std::size_t > traj, std::string header_comment, std::map< std::string, float > stringMap)
write state trajectory into plain text file
Definition: tools.cpp:63
Noise Assignment.
std::vector< std::size_t > read_clustered_trajectory(std::string filename)
read states from trajectory (given as plain text file)
Definition: tools.cpp:58
std::string stringprintf(const std::string &str,...)
printf-version for std::string
Definition: tools.cpp:80