tools.hxx
1 /*
2 Copyright (c) 2015-2019, Florian Sittel (www.lettis.net) and Daniel Nagel
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #include "tools.hpp"
27 #include "logger.hpp"
28 
29 #include <iostream>
30 #include <fstream>
31 #include <sstream>
32 #include <iterator>
33 #include <map>
34 #include <algorithm>
35 
36 namespace Clustering {
37 namespace Tools {
38 
39 template <typename NUM>
40 std::tuple<NUM*, std::size_t, std::size_t>
41 read_coords(std::string filename, std::vector<std::size_t> usecols) {
42  std::size_t n_rows=0;
43  std::size_t n_cols=0;
44  std::size_t n_cols_used=0;
45  std::ifstream ifs(filename);
46  Clustering::logger(std::cout) << "~~~ reading coordinates" << std::endl;
47  if (ifs.fail()) {
48  std::cerr << "error: cannot open file '" << filename << "'" << std::endl;
49  exit(EXIT_FAILURE);
50  }
51  Clustering::logger(std::cout) << " from file: " << filename << std::endl;
52  {
53  // determine n_cols
54  std::string linebuf;
55  while (linebuf.empty() && ifs.good()) {
56  std::getline(ifs, linebuf);
57  }
58  std::stringstream ss(linebuf);
59  n_cols = std::distance(std::istream_iterator<std::string>(ss),
60  std::istream_iterator<std::string>());
61  // go back to beginning to read complete file
62  ifs.seekg(0);
63  // determine n_rows
64  while (ifs.good()) {
65  std::getline(ifs, linebuf);
66  if ( ! linebuf.empty()) {
67  ++n_rows;
68  }
69  }
70  // go back again
71  ifs.clear();
72  ifs.seekg(0, std::ios::beg);
73  }
74  Clustering::logger(std::cout) << " with dimesions: " << n_rows << "x"
75  << n_cols << "\n" << std::endl;
76  std::map<std::size_t, bool> col_used;
77  if (usecols.size() == 0) {
78  // use all columns
79  n_cols_used = n_cols;
80  for (std::size_t i=0; i < n_cols; ++i) {
81  col_used[i] = true;
82  }
83  } else {
84  // use only defined columns
85  n_cols_used = usecols.size();
86  for (std::size_t i=0; i < n_cols; ++i) {
87  col_used[i] = false;
88  }
89  for (std::size_t i: usecols) {
90  col_used[i] = true;
91  }
92  }
93  // allocate memory
94  // DC_MEM_ALIGNMENT is defined during cmake and
95  // set depending on usage of SSE2, SSE4_1, AVX or Xeon Phi
96  NUM* coords = (NUM*) _mm_malloc(sizeof(NUM)*n_rows*n_cols_used, DC_MEM_ALIGNMENT);
97  ASSUME_ALIGNED(coords);
98  // read data
99  for (std::size_t cur_row = 0; cur_row < n_rows; ++cur_row) {
100  std::size_t cur_col = 0;
101  for (std::size_t i=0; i < n_cols; ++i) {
102  NUM buf;
103  ifs >> buf;
104  if (col_used[i]) {
105  coords[cur_row*n_cols_used + cur_col] = buf;
106  ++cur_col;
107  }
108  }
109  }
110  return std::make_tuple(coords, n_rows, n_cols_used);
111 }
112 
113 
// Release a coordinate buffer by passing it to _mm_free, the counterpart
// of the _mm_malloc call used by read_coords.
//
// coords: pointer to the buffer that is to be freed
template <typename NUM>
void free_coords(NUM* coords) {
  _mm_free(coords);
}
119 
// Return a copy of a row-major coordinate table whose rows are ordered
// by ascending value of their first column.
//
// coords: pointer to n_rows*n_cols values, row after row
// n_rows: number of rows
// n_cols: number of columns per row
//
// Returns a flat, row-major vector of n_rows*n_cols values. Only the
// first column takes part in the comparison and std::sort is not stable,
// so the relative order of rows with equal first value is unspecified.
template <typename NUM>
std::vector<NUM>
dim1_sorted_coords(const NUM* coords
                 , std::size_t n_rows
                 , std::size_t n_cols) {
  std::vector<NUM> sorted_coords(n_rows*n_cols);
  if (sorted_coords.empty()) {
    // nothing to sort; this also keeps the comparator below from
    // accessing element 0 of zero-width rows
    return sorted_coords;
  }
  if (n_cols == 1) {
    // directly sort on data if just one column
    std::copy(coords, coords + n_rows, sorted_coords.begin());
    std::sort(sorted_coords.begin(), sorted_coords.end());
  } else {
    // one vector per row, so that std::sort moves complete rows;
    // the element type follows NUM instead of being fixed to float
    std::vector<std::vector<NUM>> c_tmp(n_rows);
    for (std::size_t i=0; i < n_rows; ++i) {
      c_tmp[i].assign(coords + i*n_cols, coords + (i+1)*n_cols);
    }
    // sort on first index
    std::sort(c_tmp.begin()
            , c_tmp.end()
            , [] (const std::vector<NUM>& lhs
                , const std::vector<NUM>& rhs) {
                return lhs[0] < rhs[0];
              });
    // feed sorted data into 1D-array
    auto out = sorted_coords.begin();
    for (const std::vector<NUM>& row: c_tmp) {
      out = std::copy(row.begin(), row.end(), out);
    }
  }
  return sorted_coords;
}
156 
// Compute the lower limits of consecutive boxes of `boxsize` rows along
// the first column of a row-major table.
//
// xs:      flat row-major table; the first-column value of every
//          boxsize-th row is picked as a limit. It has to hold at least
//          n_rows*n_cols values (not checked here).
//          NOTE(review): presumably the output of dim1_sorted_coords,
//          i.e. sorted on the first column - confirm against callers.
// boxsize: number of rows per box
// n_rows:  number of rows in xs
// n_cols:  number of columns in xs
//
// Returns one limit per box, i.e. ceil(n_rows / boxsize) values; the
// last box may hold fewer than boxsize rows. A boxsize of zero yields an
// empty result instead of dividing by zero.
template <typename NUM>
std::vector<NUM>
boxlimits(const std::vector<NUM>& xs
        , std::size_t boxsize
        , std::size_t n_rows
        , std::size_t n_cols) {
  if (boxsize == 0) {
    // no meaningful partition exists
    return std::vector<NUM>();
  }
  // round up, so that an incomplete trailing box gets its own limit
  const std::size_t n_boxes = n_rows / boxsize
                            + (n_rows % boxsize != 0 ? 1 : 0);
  std::vector<NUM> limits(n_boxes);
  for (std::size_t i=0; i < n_boxes; ++i) {
    // split into boxes on 1st dimension
    // (i.e. col-index == 0)
    limits[i] = xs[i*boxsize*n_cols];
  }
  return limits;
}
176 
// Find the index range of boxes touched by the interval
// [val - radius, val + radius].
//
// limits: one limit per box, as produced by boxlimits
// val:    centre of the interval
// radius: half width of the interval
//
// Returns (index of first box, index of last box); (0,0) if there are
// no boxes at all.
template <typename NUM>
std::pair<std::size_t, std::size_t>
min_max_box(const std::vector<NUM>& limits
          , NUM val
          , NUM radius) {
  if (limits.empty()) {
    return {0,0};
  }
  const NUM lower = val - radius;
  const NUM upper = val + radius;
  // Scan boxes 1..n-1 for the first limit above the lower bound; the box
  // right before it is the minimum. Without a match find_if returns
  // end(), whose distance minus one is the last box.
  const auto above = std::find_if(limits.begin() + 1
                                , limits.end()
                                , [lower] (const NUM& limit) {
                                    return lower < limit;
                                  });
  const std::size_t box_min =
    static_cast<std::size_t>(std::distance(limits.begin(), above)) - 1;
  // Scan from the back for the last limit below the upper bound; fall
  // back to box 0 if there is none.
  const auto below = std::find_if(limits.rbegin()
                                , limits.rend()
                                , [upper] (const NUM& limit) {
                                    return limit < upper;
                                  });
  std::size_t box_max = 0;
  if (below != limits.rend()) {
    box_max = static_cast<std::size_t>(std::distance(below, limits.rend())) - 1;
  }
  return {box_min, box_max};
}
205 
206 
// Write a key-value map to a plain text file, one entry per line with
// the two fields separated by a single space.
//
// filename:       path of the file to (over)write
// mapping:        entries to write; they appear in the map's key order
// header_comment: text written verbatim before the first entry
// val_then_key:   if true, write "value key" instead of "key value"
//
// Terminates the program via exit(EXIT_FAILURE) if the file cannot be
// opened for writing.
template <typename KEY, typename VAL>
void
write_map(std::string filename, std::map<KEY, VAL> mapping,
          std::string header_comment, bool val_then_key) {
  std::ofstream out(filename);
  if (out.fail()) {
    std::cerr << "error: cannot open file '" << filename << "' for writing." << std::endl;
    exit(EXIT_FAILURE);
  }
  out << header_comment;
  for (const auto& entry: mapping) {
    if (val_then_key) {
      out << entry.second << " " << entry.first << "\n";
    } else {
      out << entry.first << " " << entry.second << "\n";
    }
  }
}
227 
// Read all numbers from a plain text file into a vector.
//
// Values are extracted one after another with operator>>. Whenever an
// extraction fails (e.g. on a comment line), the rest of the current
// line is discarded and reading continues with the next line.
//
// filename: path of the file to read
//
// Returns the values in file order. Terminates the program via
// exit(EXIT_FAILURE) if the file cannot be opened or if not a single
// value could be read.
template <typename NUM>
std::vector<NUM>
read_single_column(std::string filename) {
  std::ifstream ifs(filename);
  if (ifs.fail()) {
    std::cerr << "error: cannot open file '" << filename << "'" << std::endl;
    exit(EXIT_FAILURE);
  }
  std::vector<NUM> dat;
  while ( ! (ifs.eof() || ifs.bad())) {
    NUM buf;
    if (ifs >> buf) {
      dat.push_back(buf);
      continue;
    }
    // conversion error: reset the stream state and skip the remainder
    // of the (comment) line
    ifs.clear();
    ifs.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
  }
  if (dat.empty()) {
    std::cerr << "error: opened empty file '" << filename << "'" << std::endl;
    exit(EXIT_FAILURE);
  }
  return dat;
}
254 
255 
// Write a sequence of numbers to a plain text file, one value per line.
//
// filename:               path of the file to (over)write
// dat:                    values to write, in the given order
// header_comment:         text written verbatim before the first value
// with_scientific_format: if true, switch the stream to std::scientific
//                         before writing the values
//
// Terminates the program via exit(EXIT_FAILURE) if the file cannot be
// opened for writing.
template <typename NUM>
void
write_single_column(std::string filename, std::vector<NUM> dat,
                    std::string header_comment, bool with_scientific_format) {
  std::ofstream out(filename);
  if (out.fail()) {
    std::cerr << "error: cannot open file '" << filename << "' for writing." << std::endl;
    exit(EXIT_FAILURE);
  }
  out << header_comment;
  if (with_scientific_format) {
    out << std::scientific;
  }
  for (auto it = dat.cbegin(); it != dat.cend(); ++it) {
    const NUM value = *it;
    out << value << "\n";
  }
}
273 
// Convert the leading token of a string to a number of type NUM.
//
// s: text to parse; leading whitespace is skipped, anything following
//    the number is ignored
//
// Returns the parsed value. If no number can be read (empty string,
// only whitespace, non-numeric text) the result is a value-initialized
// NUM, i.e. zero for arithmetic types.
template <typename NUM>
NUM
string_to_num(const std::string &s) {
  std::istringstream ss(s);
  // value-initialize: if the stream hits the end before any token, the
  // extraction below stores nothing at all
  NUM buf = NUM();
  ss >> buf;
  return buf;
}
282 
// Return the distinct elements of a vector in ascending order.
//
// xs: input values (taken by value, so the caller's vector is untouched)
//
// Returns the sorted values with consecutive duplicates removed.
template <typename T>
std::vector<T>
unique_elements(std::vector<T> xs) {
  std::sort(xs.begin(), xs.end());
  // std::unique shifts the distinct values to the front; erase the tail
  xs.erase(std::unique(xs.begin(), xs.end()), xs.end());
  return xs;
}
294 
295 
296 } // end namespace Tools
297 } // end namespace Clustering
298 
std::tuple< NUM *, std::size_t, std::size_t > read_coords(std::string filename, std::vector< std::size_t > usecols=std::vector< std::size_t >())
Definition: tools.hxx:41
general namespace for clustering package
Definition: coring.cpp:38
Tools mainly for IO and some other functions.
#define ASSUME_ALIGNED(c)
needed for aligned memory allocation for Xeon Phi, SSE or AVX
Definition: tools.hpp:47
std::vector< NUM > dim1_sorted_coords(const NUM *coords, std::size_t n_rows, std::size_t n_cols)
Definition: tools.hxx:122
Define global logger.
void free_coords(NUM *coords)
free memory pointing to coordinates
Definition: tools.hxx:116
NUM string_to_num(const std::string &s)
convert std::string to number of given template format
Definition: tools.hxx:276
void write_map(std::string filename, std::map< KEY, VAL > mapping, std::string header_comment, bool val_then_key=false)
write key-value map to plain text file with key as first and value as second column ...
Definition: tools.hxx:209
std::ostream & logger(std::ostream &s)
Definition: logger.cpp:32
std::vector< T > unique_elements(std::vector< T > xs)
return distinct elements of vector
Definition: tools.hxx:285
void write_single_column(std::string filename, std::vector< NUM > dat, std::string header_comment, bool with_scientific_format=false)
write single column of numbers to given file. number type (int, float, ...) given as template parameter.
Definition: tools.hxx:258
std::vector< NUM > boxlimits(const std::vector< NUM > &xs, std::size_t boxsize, std::size_t n_rows, std::size_t n_cols)
Definition: tools.hxx:159
std::vector< NUM > read_single_column(std::string filename)
read single column of numbers from given file. number type (int, float, ...) given as template parameter.
Definition: tools.hxx:230
std::pair< std::size_t, std::size_t > min_max_box(const std::vector< NUM > &limits, NUM val, NUM radius)
return indices of min and max boxes around value for given radius.
Definition: tools.hxx:179