cxtream  0.5.1
C++17 data pipeline with Python bindings.
csv.hpp
1 /****************************************************************************
2  * cxtream library
3  * Copyright (c) 2017, Cognexa Solutions s.r.o.
4  * Author(s) Filip Matzner
5  *
6  * This file is distributed under the MIT License.
7  * See the accompanying file LICENSE.txt for the complete license agreement.
8  ****************************************************************************/
10 
11 #ifndef CXTREAM_CORE_CSV_HPP
12 #define CXTREAM_CORE_CSV_HPP
13 
14 #include <cxtream/core/dataframe.hpp>
15 
16 #include <boost/algorithm/string.hpp>
17 #include <range/v3/algorithm/find_first_of.hpp>
18 #include <range/v3/view/drop.hpp>
19 #include <range/v3/view/move.hpp>
20 
21 #include <cctype>
22 #include <climits>
23 #include <deque>
24 #include <experimental/filesystem>
25 #include <fstream>
26 #include <iomanip>
27 #include <iostream>
28 #include <vector>
29 
30 namespace cxtream {
31 
32 namespace detail {
33 
/// Read and discard blank characters (similar to std::ws, but uses std::isblank).
///
/// Characters are consumed for as long as std::isblank() accepts the next
/// character in the stream, so a newline is left in the stream.
///
/// Declared inline because this is a header: a non-inline definition would
/// produce duplicate symbols once the header is included from several
/// translation units.
///
/// \param in The stream to read from.
/// \returns The same stream, which allows usage as a stream manipulator.
inline std::istream& blanks(std::istream& in)
{
    // peek() yields either EOF or an unsigned-char value, both of which are
    // valid arguments for std::isblank().
    while (std::isblank(in.peek())) in.get();
    return in;
}
40 
41 } // namespace detail
42 
57 class csv_istream_range : public ranges::view_facade<csv_istream_range> {
58 private:
60  friend ranges::range_access;
62  using single_pass = std::true_type;
63  enum class RowPosition{Normal, Last, End};
64 
65  std::istream* in_;
66  char separator_;
67  char quote_;
68  char escape_;
69 
70  std::vector<std::string> row_;
71  RowPosition row_position_ = RowPosition::Normal;
72 
73  class cursor {
74  private:
75  csv_istream_range* rng_;
76 
77  public:
78  cursor() = default;
79  explicit cursor(csv_istream_range& rng) noexcept
80  : rng_{&rng}
81  {}
82 
83  void next()
84  {
85  rng_->next();
86  }
87 
88  std::vector<std::string>& read() const noexcept
89  {
90  return rng_->row_;
91  }
92 
93  std::vector<std::string>&& move() const noexcept
94  {
95  return std::move(rng_->row_);
96  }
97 
98  bool equal(ranges::default_sentinel) const noexcept
99  {
100  return rng_->row_position_ == RowPosition::End;
101  }
102  };
103 
104  // parse csv field and return whether the next separator is found
105  std::tuple<std::string, bool> parse_field()
106  {
107  std::string field;
108  char c;
109  while (in_->get(c)) {
110  if (c == separator_) {
111  return {std::move(field), true};
112  } else if (c == '\n') {
113  return {std::move(field), false};
114  }
115  field.push_back(c);
116  }
117  return {std::move(field), false};
118  }
119 
120  // parse csv row
121  void next()
122  {
123  if (!in_->good() || row_position_ == RowPosition::Last) {
124  row_position_ = RowPosition::End;
125  return;
126  }
127 
128  // temporarily set badbit exception mask
129  auto orig_exceptions = in_->exceptions();
130  in_->exceptions(orig_exceptions | std::istream::badbit);
131 
132  row_.clear();
133  bool has_next = true;
134  while (has_next && *in_ >> detail::blanks) {
135  std::string field;
136  // process quoted fields
137  if (in_->peek() == quote_) {
138  *in_ >> std::quoted(field, quote_, escape_);
139  if (in_->fail()) throw std::ios_base::failure{"Error while reading CSV field."};
140  std::tie(std::ignore, has_next) = parse_field();
141  }
142  // process unquoted fields
143  else {
144  std::tie(field, has_next) = parse_field();
145  boost::trim(field);
146  }
147  row_.push_back(std::move(field));
148  }
149 
150  // detect whether end of file is reached
151  *in_ >> std::ws;
152  in_->peek();
153  if (!in_->good()) {
154  row_position_ = RowPosition::Last;
155  }
156 
157  // reset exception mask
158  in_->exceptions(orig_exceptions);
159  }
160 
161  cursor begin_cursor()
162  {
163  return cursor{*this};
164  }
165 
166 public:
167  csv_istream_range() = default;
168 
169  explicit csv_istream_range(std::istream& in,
170  char separator = ',',
171  char quote = '"',
172  char escape = '\\')
173  : in_{&in}
174  , separator_{separator}
175  , quote_{quote}
176  , escape_{escape}
177  {
178  next();
179  }
180 };
181 
193 dataframe<> read_csv(std::istream& in,
194  int drop = 0,
195  bool has_header = true,
196  char separator = ',',
197  char quote = '"',
198  char escape = '\\')
199 {
200  // header
201  std::vector<std::string> header;
202  // data
203  std::vector<std::vector<std::string>> data;
204  // load csv line by line
205  auto csv_rows =
206  csv_istream_range(in, separator, quote, escape)
208  | ranges::view::move;
209  auto csv_row_it = ranges::begin(csv_rows);
210  // load header if requested
211  std::size_t n_cols = -1;
212  if (has_header) {
213  if (csv_row_it == ranges::end(csv_rows)) {
214  throw std::ios_base::failure{"There has to be at least the header row."};
215  }
216  std::vector<std::string> csv_row = *csv_row_it;
217  n_cols = ranges::size(csv_row);
218  header = std::move(csv_row);
219  data.resize(n_cols);
220  ++csv_row_it;
221  }
222  // load data
223  for (std::size_t i = 0; csv_row_it != ranges::end(csv_rows); ++csv_row_it, ++i) {
224  std::vector<std::string> csv_row = *csv_row_it;
225  // sanity check row size
226  if (i == 0) {
227  if (has_header) {
228  if (ranges::size(csv_row) != n_cols) {
229  throw std::ios_base::failure{"The first row must have the same "
230  "length as the header."};
231  }
232  } else {
233  n_cols = ranges::size(csv_row);
234  data.resize(n_cols);
235  }
236  } else {
237  if (ranges::size(csv_row) != n_cols) {
238  throw std::ios_base::failure{"Row " + std::to_string(i)
239  + " has a different length "
240  + "(has: " + std::to_string(ranges::size(csv_row))
241  + " , expected: " + std::to_string(n_cols)
242  + ")."};
243  }
244  }
245  // store columns
246  for (std::size_t j = 0; j < ranges::size(csv_row); ++j) {
247  data[j].push_back(std::move(csv_row[j]));
248  }
249  }
250  return {std::move(data), std::move(header)};
251 }
252 
256 dataframe<> read_csv(const std::experimental::filesystem::path& file,
257  int drop = 0,
258  bool header = true,
259  char separator = ',',
260  char quote = '"',
261  char escape = '\\')
262 {
263  std::ifstream fin{file};
264  if (!fin) {
265  throw std::ios_base::failure{"Cannot open " + file.string() + " CSV file for reading."};
266  }
267  return read_csv(fin, drop, header, separator, quote, escape);
268 }
269 
270 namespace detail {
271 
/// Check whether the given string starts or ends with a whitespace character.
///
/// \param str The string to inspect.
/// \returns True iff the string is non-empty and std::isspace() accepts its
///          first or its last character.
inline bool trimmable(const std::string& str)
{
    if (str.empty()) return false;
    // std::isspace() requires a value representable as unsigned char (or EOF);
    // passing a negative plain char is undefined behavior.
    const auto is_space = [](char c) {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    };
    return is_space(str.front()) || is_space(str.back());
}
277 
278 } // namespace detail
279 
286 template <typename Row>
287 std::ostream& write_csv_row(std::ostream& out,
288  Row&& row,
289  char separator = ',',
290  char quote = '"',
291  char escape = '\\')
292 {
293  // temporarily set badbit exception mask
294  auto orig_exceptions = out.exceptions();
295  out.exceptions(orig_exceptions | std::ostream::badbit);
296 
297  for (std::size_t i = 0; i < ranges::size(row); ++i) {
298  auto& field = row[i];
299  // output quoted string if it contains separator, double quote, newline or
300  // starts or ends with a whitespace
301  if (ranges::find_first_of(field, {separator, quote, '\n'}) != ranges::end(field)
302  || detail::trimmable(field)) {
303  out << std::quoted(field, quote, escape);
304  } else {
305  out << field;
306  }
307 
308  // output separator or newline
309  if (i + 1 < ranges::size(row)) out << separator;
310  else out << '\n';
311  }
312 
313  out.exceptions(orig_exceptions);
314  return out;
315 }
316 
323 template <typename DataTable>
324 std::ostream& write_csv(std::ostream& out,
325  const dataframe<DataTable>& df,
326  char separator = ',',
327  char quote = '"',
328  char escape = '\\')
329 {
330  write_csv_row(out, df.header(), separator, quote, escape);
331  for (auto&& row : df.raw_rows()) {
332  write_csv_row(out, row, separator, quote, escape);
333  }
334  return out;
335 }
336 
340 template <typename DataTable>
341 void write_csv(const std::experimental::filesystem::path& file,
342  const dataframe<DataTable>& df,
343  char separator = ',',
344  char quote = '"',
345  char escape = '\\')
346 {
347  std::ofstream fout{file};
348  if (!fout) {
349  throw std::ios_base::failure{"Cannot open " + file.string() + " CSV file for writing."};
350  }
351  write_csv(fout, df, separator, quote, escape);
352 }
353 
354 } // namespace cxtream
355 #endif
constexpr ranges::view::view< detail::drop_fn< Columns... > > drop
Drops columns from a stream.
Definition: drop.hpp:77
std::string to_string(const T &value)
Convert the given type to std::string.
Definition: string.hpp:91
Tabular object with convenient data access methods.
Definition: dataframe.hpp:38
void write_csv(const std::experimental::filesystem::path &file, const dataframe< DataTable > &df, char separator=',', char quote='"', char escape='\\')
Same as write_csv(std::ostream...), but write directly to a file.
Definition: csv.hpp:341
Parse and iterate over CSV formatted rows from an istream.
Definition: csv.hpp:57
dataframe read_csv(const std::experimental::filesystem::path &file, int drop=0, bool header=true, char separator=',', char quote='"', char escape='\\')
Same as read_csv() but read directly from a file.
Definition: csv.hpp:256
void header(std::vector< std::string > new_header)
Definition: dataframe.hpp:687
std::ostream & write_csv_row(std::ostream &out, Row &&row, char separator=',', char quote='"', char escape='\\')
Write a single csv row to an std::ostream.
Definition: csv.hpp:287