cxtream  0.5.1
C++17 data pipeline with Python bindings.
groups.hpp
1 /****************************************************************************
2  * cxtream library
3  * Copyright (c) 2017, Cognexa Solutions s.r.o.
4  * Author(s) Filip Matzner
5  *
6  * This file is distributed under the MIT License.
7  * See the accompanying file LICENSE.txt for the complete license agreement.
8  ****************************************************************************/
10 
11 #ifndef CXTREAM_CORE_GROUPS_HPP
12 #define CXTREAM_CORE_GROUPS_HPP
13 
14 #include <cxtream/core/utility/random.hpp>
15 
16 #include <range/v3/action/insert.hpp>
17 #include <range/v3/action/shuffle.hpp>
18 #include <range/v3/algorithm/all_of.hpp>
19 #include <range/v3/algorithm/copy.hpp>
20 #include <range/v3/numeric/accumulate.hpp>
21 #include <range/v3/view/concat.hpp>
22 #include <range/v3/view/drop.hpp>
23 #include <range/v3/view/filter.hpp>
24 #include <range/v3/view/iota.hpp>
25 #include <range/v3/view/repeat_n.hpp>
26 #include <range/v3/view/take.hpp>
27 
28 #include <vector>
29 
30 namespace cxtream {
31 
47 template<typename Prng = std::mt19937&>
48 std::vector<std::size_t> generate_groups(std::size_t size, std::vector<double> ratio,
49  Prng&& gen = utility::random_generator)
50 {
51  namespace view = ranges::view;
52 
53  // check all ratios non-negative
54  assert(ranges::all_of(ratio, [](double d) { return d >= 0; }));
55 
56  // check positive ratio sum
57  double ratio_sum = ranges::accumulate(ratio, 0.);
58  assert(ratio_sum > 0);
59 
60  // remove trailing zeros
61  ratio.erase(std::find_if(ratio.rbegin(), ratio.rend(), [](double r) { return r > 0; }).base(),
62  ratio.end());
63 
64  // scale to [0, 1]
65  for (double& r : ratio) r /= ratio_sum;
66 
67  std::vector<std::size_t> groups;
68  groups.reserve(size);
69 
70  for (std::size_t i = 0; i < ratio.size(); ++i) {
71  std::size_t count = std::lround(ratio[i] * size);
72  // take all the remaining elements if this is the last non-zero group
73  if (i + 1 == ratio.size()) count = size - groups.size();
74  ranges::action::insert(groups, groups.end(), view::repeat_n(i, count));
75  }
76 
77  ranges::action::shuffle(groups, gen);
78  return groups;
79 }
80 
106 template<typename Prng = std::mt19937&>
107 std::vector<std::vector<std::size_t>>
108 generate_groups(std::size_t n, std::size_t size,
109  const std::vector<double>& volatile_ratio,
110  const std::vector<double>& fixed_ratio,
111  Prng&& gen = utility::random_generator)
112 {
113  namespace view = ranges::view;
114 
115  std::size_t volatile_size = volatile_ratio.size();
116  auto full_ratio = view::concat(volatile_ratio, fixed_ratio);
117 
118  std::vector<std::vector<std::size_t>> all_groups;
119  std::vector<std::size_t> initial_groups = generate_groups(size, full_ratio, gen);
120 
121  for (std::size_t i = 0; i < n; ++i) {
122  auto groups = initial_groups;
123  // select those groups, which are volatile (those will be replaced)
124  auto groups_volatile =
125  groups | view::filter([volatile_size](std::size_t l) { return l < volatile_size; });
126  // count the number of volatile groups
127  std::size_t volatile_count = ranges::distance(groups_volatile);
128  // generate the replacement
129  auto groups_volatile_new = generate_groups(volatile_count, volatile_ratio, gen);
130  // replace
131  ranges::copy(groups_volatile_new, groups_volatile.begin());
132  // store
133  all_groups.emplace_back(std::move(groups));
134  }
135 
136  return all_groups;
137 }
138 
139 } // end namespace cxtream
140 #endif
static thread_local std::mt19937 random_generator
Thread local pseudo-random number generator seeded by std::random_device.
Definition: random.hpp:20
constexpr auto filter(from_t< FromColumns... > f, by_t< ByColumns... > b, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
Filter stream data.
Definition: filter.hpp:127
std::vector< std::vector< std::size_t > > generate_groups(std::size_t n, std::size_t size, const std::vector< double > &volatile_ratio, const std::vector< double > &fixed_ratio, Prng &&gen=utility::random_generator)
Randomly group data into multiple clusters with a given ratio.
Definition: groups.hpp:108