cxtream  0.5.1
C++17 data pipeline with Python bindings.
Classes | Macros | Functions | Variables
Stream modifiers and data types.

Classes

class  cxtream::stream::column_base< T, bool >
 Base class for cxtream columns. More...
 

Macros

#define CXTREAM_DEFINE_COLUMN(col_name, col_type)
 Macro for fast column definition. More...
 

Functions

template<typename... FromColumns, typename... ByColumns, typename Fun , int Dim = 1>
constexpr auto cxtream::stream::filter (from_t< FromColumns... > f, by_t< ByColumns... > b, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
 Filter stream data. More...
 
template<typename... FromColumns, typename Fun , int Dim = 1>
constexpr auto cxtream::stream::for_each (from_t< FromColumns... > f, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
 Apply a function to a subset of stream columns. More...
 
template<typename FromColumn , typename ToColumn , typename Gen , int Dim = utility::ndims<typename ToColumn::batch_type>::value - utility::ndims<std::result_of_t<Gen()>>::value>
constexpr auto cxtream::stream::generate (from_t< FromColumn > size_from, to_t< ToColumn > fill_to, Gen gen, long gendims=std::numeric_limits< long >::max(), dim_t< Dim > d=dim_t< Dim >{})
 Fill the selected column using a generator (i.e., a nullary function). More...
 
template<typename FromColumn , typename MaskColumn , typename ValT = typename utility::ndim_type_t< typename FromColumn::batch_type, utility::ndims<typename MaskColumn::batch_type>::value>>
constexpr auto cxtream::stream::pad (from_t< FromColumn > f, mask_t< MaskColumn > m, ValT value=ValT{})
 Pad the selected column to a rectangular size. More...
 
template<typename FromColumn , typename ToColumn , typename Prng = std::mt19937, typename Dist = std::uniform_real_distribution<double>, int Dim = utility::ndims<typename ToColumn::batch_type>::value - utility::ndims<std::result_of_t<Dist(Prng&)>>::value>
constexpr auto cxtream::stream::random_fill (from_t< FromColumn > size_from, to_t< ToColumn > fill_to, long rnddims=std::numeric_limits< long >::max(), Dist dist=Dist{0, 1}, Prng &prng=cxtream::utility::random_generator, dim_t< Dim > d=dim_t< Dim >{})
 Fill the selected column of a stream with random values. More...
 
template<typename... FromColumns, typename... ToColumns, typename Fun , int Dim = 1>
constexpr auto cxtream::stream::transform (from_t< FromColumns... > f, to_t< ToColumns... > t, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
 Transform a subset of cxtream columns to a different subset of cxtream columns. More...
 
template<typename... FromColumns, typename... ToColumns, typename CondColumn , typename Fun , int Dim = 1>
constexpr auto cxtream::stream::transform (from_t< FromColumns... > f, to_t< ToColumns... > t, cond_t< CondColumn > c, Fun fun, dim_t< Dim > d=dim_t< 1 >{})
 Conditional transform of a subset of cxtream columns. More...
 
template<typename... FromColumns, typename... ToColumns, typename Fun , typename Prng = std::mt19937, int Dim = 1>
constexpr auto cxtream::stream::transform (from_t< FromColumns... > f, to_t< ToColumns... > t, double prob, Fun fun, Prng &prng=utility::random_generator, dim_t< Dim > d=dim_t< 1 >{})
 Probabilistic transform of a subset of cxtream columns. More...
 
template<typename Rng , typename... FromColumns, int Dim = 1>
constexpr auto cxtream::stream::unpack (Rng &&rng, from_t< FromColumns... > f, dim_t< Dim > d=dim_t< 1 >{})
 Unpack a stream into a tuple of ranges. More...
 

Variables

constexpr ranges::view::view< batch_fn > cxtream::stream::batch {}
 Accumulate the stream and yield batches of a different size. More...
 
constexpr ranges::view::view< buffer_fn > cxtream::stream::buffer {}
 Asynchronously buffers the given range. More...
 
template<typename... Columns>
constexpr ranges::view::view< detail::create_fn< Columns... > > cxtream::stream::create {}
 Converts a range to a stream (i.e., to a range of tuples of columns). More...
 
template<typename... Columns>
constexpr ranges::view::view< detail::drop_fn< Columns... > > cxtream::stream::drop {}
 Drops columns from a stream. More...
 

Detailed Description

Macro Definition Documentation

◆ CXTREAM_DEFINE_COLUMN

#define CXTREAM_DEFINE_COLUMN (   col_name,
  col_type 
)
Value:
struct col_name : cxtream::stream::column_base<col_type> { \
static constexpr const char* name() { return #col_name; } \
};
Base class for cxtream columns.
Definition: column.hpp:24

Macro for fast column definition.

Under the hood, it creates a new type derived from column_base.

Definition at line 90 of file column.hpp.

Function Documentation

◆ filter()

template<typename... FromColumns, typename... ByColumns, typename Fun , int Dim = 1>
constexpr auto cxtream::stream::filter ( from_t< FromColumns... >  f,
by_t< ByColumns... >  b,
Fun  fun,
dim_t< Dim >  d = dim_t<1>{} 
)

Filter stream data.

Example:

CXTREAM_DEFINE_COLUMN(value, double)
std::vector<std::tuple<int, double>> data = {{3, 5.}, {1, 2.}};
auto rng = data
| create<id, value>()
| filter(from<id, value>, by<value>, [](double value) { return value > 3.; });
Parameters
f: The columns to be filtered.
b: The columns to be passed to the filtering function. Those have to be a subset of f.
fun: The filtering function returning a boolean.
d: The dimension in which the function is applied. Choose 0 to filter whole batches (in such a case, the f parameter is ignored).

Definition at line 127 of file filter.hpp.

◆ for_each()

template<typename... FromColumns, typename Fun , int Dim = 1>
constexpr auto cxtream::stream::for_each ( from_t< FromColumns... >  f,
Fun  fun,
dim_t< Dim >  d = dim_t<1>{} 
)

Apply a function to a subset of stream columns.

The given function is applied to a subset of columns given by FromColumns. The transformed range is the same as the input range; no elements are actually changed. The function is applied lazily, i.e., only when the range is iterated.

Example:

CXTREAM_DEFINE_COLUMN(Double, double)
std::vector<std::tuple<Int, Double>> data = {{3, 5.}, {1, 2.}};
auto rng = data
| for_each(from<Int, Double>, [](int& v, double& d) { std::cout << v + d; });
Parameters
f: The columns to be extracted out of the tuple of columns and passed to fun.
fun: The function to be applied.
d: The dimension in which the function is applied. Choose 0 for the function to be applied to the whole batch.

Definition at line 59 of file for_each.hpp.

◆ generate()

template<typename FromColumn , typename ToColumn , typename Gen , int Dim = utility::ndims<typename ToColumn::batch_type>::value - utility::ndims<std::result_of_t<Gen()>>::value>
constexpr auto cxtream::stream::generate ( from_t< FromColumn >  size_from,
to_t< ToColumn >  fill_to,
Gen  gen,
long  gendims = std::numeric_limits<long>::max(),
dim_t< Dim >  d = dim_t<Dim>{} 
)

Fill the selected column using a generator (i.e., a nullary function).

This function uses utility::generate(). Furthermore, the column to be filled is first resized so that it has the same size as the selected source column.

Tip: If there is no column the size could be taken from, then just resize the target column manually and use it as both from column and to column.

Example:

CXTREAM_DEFINE_COLUMN(value, double)
std::vector<int> data = {3, 1, 2};
auto rng = data
| create<id>()
// assign each id a value from an increasing sequence
| generate(from<id>, to<value>, [i = 0]() mutable { return i++; });
Parameters
size_from: The column whose size will be used to initialize the generated column.
fill_to: The column to be filled using the generator.
gen: The generator to be used.
gendims: The number of generated dimensions. See utility::generate().
d: This is the dimension in which the generator will be applied. E.g., if set to 1, the generator result is considered to be a single example. The default is ndims<ToColumn::batch_type> - ndims<gen()>. This value has to be positive.

Definition at line 77 of file generate.hpp.

◆ pad()

template<typename FromColumn , typename MaskColumn , typename ValT = typename utility::ndim_type_t< typename FromColumn::batch_type, utility::ndims<typename MaskColumn::batch_type>::value>>
constexpr auto cxtream::stream::pad ( from_t< FromColumn >  f,
mask_t< MaskColumn >  m,
ValT  value = ValT{} 
)

Pad the selected column to a rectangular size.

Each batch is padded separately.

The mask of the padded values is created along with the padding. The mask evaluates to true on the positions with the original elements and to false on the positions of the padded elements. The mask column should be a multidimensional vector of type bool/char/int/... The dimensionality of the mask column is used to deduce how many dimensions should be padded in the source column.

This transformer internally uses utility::ndim_pad().

Example:

CXTREAM_DEFINE_COLUMN(sequences, std::vector<int>)
CXTREAM_DEFINE_COLUMN(sequence_masks, std::vector<bool>)
std::vector<std::vector<int>> data = {{1, 2}, {3, 4, 5}, {}, {6, 7}};
auto rng = data
| create<sequences>(2)
| pad(from<sequences>, mask<sequence_masks>, -1);
// sequences_batch_1 == {{1, 2, -1}, {3, 4, 5}}
// sequences_batch_2 == {{-1, -1}, {6, 7}}
// sequence_masks_batch_1 == {{true, true, false}, {true, true, true}}
// sequence_masks_batch_2 == {{false, false}, {true, true}}
Parameters
f: The column to be padded.
m: The column where the mask should be stored and from which the dimension is taken.
value: The value to pad with.

Definition at line 89 of file pad.hpp.

◆ random_fill()

template<typename FromColumn , typename ToColumn , typename Prng = std::mt19937, typename Dist = std::uniform_real_distribution<double>, int Dim = utility::ndims<typename ToColumn::batch_type>::value - utility::ndims<std::result_of_t<Dist(Prng&)>>::value>
constexpr auto cxtream::stream::random_fill ( from_t< FromColumn >  size_from,
to_t< ToColumn >  fill_to,
long  rnddims = std::numeric_limits<long>::max(),
Dist  dist = Dist{0, 1},
Prng &  prng = cxtream::utility::random_generator,
dim_t< Dim >  d = dim_t<Dim>{} 
)

Fill the selected column of a stream with random values.

This function uses stream::generate() and has similar semantics. That is, the column to be filled is first resized so that it has the same size as the selected source column.

Tip: If there is no column the size could be taken from, then just resize the target column manually and use it as both from column and to column.

Example:

CXTREAM_DEFINE_COLUMN(value, double)
std::vector<int> data = {3, 1, 2};
auto rng = data
| create<id>()
| random_fill(from<id>, to<value>)
| transform(from<id, value>, [](...){ ... });
Parameters
size_from: The column whose size will be used to initialize the random column.
fill_to: The column to be filled with random data.
rnddims: The number of random dimensions. See utility::random_fill().
dist: The random distribution to be used. This object is copied on every use to avoid race conditions with stream::buffer().
prng: The random generator to be used.
d: This is the dimension in which the generator will be applied. E.g., if set to 1, the generator result is considered to be a single example. The default is ndims<ToColumn::batch_type> - ndims<dist(prng)>. This value has to be positive.

Definition at line 55 of file random_fill.hpp.

◆ transform() [1/3]

template<typename... FromColumns, typename... ToColumns, typename Fun , int Dim = 1>
constexpr auto cxtream::stream::transform ( from_t< FromColumns... >  f,
to_t< ToColumns... >  t,
Fun  fun,
dim_t< Dim >  d = dim_t<1>{} 
)

Transform a subset of cxtream columns to a different subset of cxtream columns.

Example:

CXTREAM_DEFINE_COLUMN(value, double)
std::vector<std::tuple<int, double>> data = {{3, 5.}, {1, 2.}};
auto rng = data
| create<id, value>()
| transform(from<id>, to<value>, [](int id) { return id * 5. + 1.; });
Parameters
f: The columns to be extracted out of the tuple of columns and passed to fun.
t: The columns where the result will be saved. If the stream does not contain the selected columns, they are added to the stream. This parameter can overlap with the parameter f.
fun: The function to be applied. The function should return the type represented by the target column in the given dimension. If there are multiple target columns, the function should return a tuple of the corresponding types.
d: The dimension in which the function is applied. Choose 0 for the function to be applied to the whole batch.

Definition at line 177 of file transform.hpp.

◆ transform() [2/3]

template<typename... FromColumns, typename... ToColumns, typename CondColumn , typename Fun , int Dim = 1>
constexpr auto cxtream::stream::transform ( from_t< FromColumns... >  f,
to_t< ToColumns... >  t,
cond_t< CondColumn >  c,
Fun  fun,
dim_t< Dim >  d = dim_t<1>{} 
)

Conditional transform of a subset of cxtream columns.

This function behaves the same as the original stream::transform(), but it accepts one extra argument denoting a column of true/false values of the same shape as the columns to be transformed. The transformation will only be applied on true values and it will be an identity on false values.

Note that this can be very useful in combination with stream::random_fill() and std::bernoulli_distribution.

Example:

CXTREAM_DEFINE_COLUMN(do_trans, char) // do not use bool here, vector<bool> is
// not a good OutputRange
std::vector<int> data_int = {3, 1, 5, 7};
// hardcoded usage
std::vector<int> data_cond = {true, true, false, false};
auto rng = ranges::view::zip(data_int, data_cond)
| create<dogs, do_trans>()
// this transforms only the first two examples and does nothing for the last two
| transform(from<dogs>, to<dogs>, cond<do_trans>, [](int dog) { return dog + 1; })
// this transformation reverts the previous one
| transform(from<dogs>, to<dogs>, cond<do_trans>, [](int dog) { return dog - 1; });
// random_fill usage
std::bernoulli_distribution dist{0.5};
auto rng2 = data_int
| create<dogs>()
| random_fill(from<dogs>, to<do_trans>, 1, dist, prng)
// the transformation of each example is performed with 50% probability
| transform(from<dogs>, to<dogs>, cond<do_trans>, [](int dog) { return dog + 1; })
// this transformation reverts the previous one
| transform(from<dogs>, to<dogs>, cond<do_trans>, [](int dog) { return dog - 1; });
Parameters
f: The columns to be extracted out of the tuple of columns and passed to fun.
t: The columns where the result will be saved. Those have to already exist in the stream.
c: The column of true/false values denoting whether the transformation should be performed or not. For false values, the transformation is an identity on the target columns.
fun: The function to be applied. The function should return the type represented by the selected column in the given dimension. If there are multiple target columns, the function should return a tuple of the corresponding types.
d: The dimension in which the function is applied. Choose 0 for the function to be applied to the whole batch.

Definition at line 284 of file transform.hpp.

◆ transform() [3/3]

template<typename... FromColumns, typename... ToColumns, typename Fun , typename Prng = std::mt19937, int Dim = 1>
constexpr auto cxtream::stream::transform ( from_t< FromColumns... >  f,
to_t< ToColumns... >  t,
double  prob,
Fun  fun,
Prng &  prng = utility::random_generator,
dim_t< Dim >  d = dim_t<1>{} 
)

Probabilistic transform of a subset of cxtream columns.

This function behaves the same as the original stream::transform(), but it accepts one extra argument denoting the probability of transformation. If this probability is 0.0, the transformer behaves as an identity. If it is 1.0, the transformation function is always applied.

Example:

std::vector<int> data = {3, 1, 5, 7};
auto rng = data
| create<dogs>()
// In 50% of the cases, the number of dogs increases,
// and in the other 50% of the cases, it stays the same.
| transform(from<dogs>, to<dogs>, 0.5, [](int dog) { return dog + 1; });
Parameters
f: The columns to be extracted out of the tuple of columns and passed to fun.
t: The columns where the result will be saved. Those have to already exist in the stream.
prob: The probability of transformation. If the dice roll fails, the transformer applies an identity on the target columns.
fun: The function to be applied. The function should return the type represented by the selected column in the given dimension. If there are multiple target columns, the function should return a tuple of the corresponding types.
prng: The random generator to be used. Defaults to a thread_local std::mt19937.
d: The dimension in which the function is applied. Choose 0 for the function to be applied to the whole batch.

Definition at line 393 of file transform.hpp.

◆ unpack()

template<typename Rng , typename... FromColumns, int Dim = 1>
constexpr auto cxtream::stream::unpack ( Rng &&  rng,
from_t< FromColumns... >  f,
dim_t< Dim >  d = dim_t<1>{} 
)

Unpack a stream into a tuple of ranges.

This operation transforms the stream (i.e., a range of tuples of columns) into a tuple of the types represented by the columns. The data can be unpacked in a specific dimension and then the higher dimensions are joined together.

If there is only a single column to be unpacked, the result is an std::vector of the corresponding type. If there are multiple columns to be unpacked, the result is a tuple of std::vectors.

Example:

CXTREAM_DEFINE_COLUMN(values, std::vector<double>)
std::vector<std::tuple<int, std::vector<double>>> data = {{3, {5., 7.}}, {1, {2., 4.}}};
auto rng = data | create<id, values>(4);
// unpack in the first dimension
std::vector<int> unp_ids;
std::vector<std::vector<double>> unp_values;
std::tie(unp_ids, unp_values) = unpack(rng, from<id, values>);
// unp_ids == {3, 1}
// unp_values == {{5., 7.}, {2., 4.}}
// unpack a single column in the second dimension
std::vector<double> unp_values_dim2;
unp_values_dim2 = unpack(rng, from<values>, dim<2>);
// unp_values_dim2 == {5., 7., 2., 4.}

Definition at line 100 of file unpack.hpp.

Variable Documentation

◆ batch

constexpr ranges::view::view<batch_fn> cxtream::stream::batch {}

Accumulate the stream and yield batches of a different size.

The batch size of the accumulated columns is allowed to differ between batches. To make one large batch of all the data, use std::numeric_limits<std::size_t>::max().

auto rng = view::iota(0, 10)
| create<value>(2) // batches the data by two examples
| batch(3); // changes the batch size to three examples

Definition at line 200 of file batch.hpp.

◆ buffer

constexpr ranges::view::view<buffer_fn> cxtream::stream::buffer {}

Asynchronously buffers the given range.

Asynchronously evaluates the given number of elements in advance. When queried for the next element, it is already prepared. This view works for any range, not only for cxtream streams.

std::vector<int> data = {1, 2, 3, 4, 5};
auto buffered_rng = data
| ranges::view::transform([](int v) { return v + 1; })
| buffer(2);

Definition at line 161 of file buffer.hpp.

◆ create

template<typename... Columns>
constexpr ranges::view::view<detail::create_fn<Columns...> > cxtream::stream::create {}

Converts a range to a stream (i.e., to a range of tuples of columns).

The value type of the input range is supposed to be either the type represented by the column to be created, or a tuple of such types if there are more columns to be created.

Example:

// rng is a stream where each batch is a single element from 0..9
auto rng = view::iota(0, 10) | create<id>();
// batched_rng is a stream with a single batch with numbers 0..9
auto batched_rng = view::iota(0, 10) | create<id>(50);
// also multiple columns can be created at once
auto rng = view::zip(view::iota(0, 10), view::iota(30, 50)) | create<id, age>();
Parameters
batch_size: The requested batch size for the provided data.

Definition at line 89 of file create.hpp.

◆ drop

template<typename... Columns>
constexpr ranges::view::view<detail::drop_fn<Columns...> > cxtream::stream::drop {}

Drops columns from a stream.

Example:

CXTREAM_DEFINE_COLUMN(value, double)
std::vector<std::tuple<int, double>> data = {{3, 5.}, {1, 2.}};
auto rng = data | create<id, value>() | drop<id>;

Definition at line 77 of file drop.hpp.