aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
CSVParser_tpl.h
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (c) 2005-2020 by Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6) *
3  * info_at_agrum_dot_org *
4  * *
5  * This program is free software; you can redistribute it and/or modify *
6  * it under the terms of the GNU General Public License as published by *
7  * the Free Software Foundation; either version 2 of the License, or *
8  * (at your option) any later version. *
9  * *
10  * This program is distributed in the hope that it will be useful, *
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13  * GNU General Public License for more details. *
14  * *
15  * You should have received a copy of the GNU General Public License *
16  * along with this program; if not, write to the *
17  * Free Software Foundation, Inc., *
18  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19  ***************************************************************************/
20 /** @file
21  * @brief Class for fast parsing of CSV files (never more than one line is
22  * held in application memory)
23  *
24  * @author Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
25  *
26  */
27 #include <agrum/tools/database/CSVParser.h>
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 
31 namespace gum {
32 
33  namespace learning {
34 
35  /// default constructor
36  template <template<typename> class ALLOC>
37  CSVParser<ALLOC>::CSVParser(
38  std::istream& instream,
39  const std::string& delimiter,
40  const char commentmarker,
41  const char quoteMarker,
42  const typename CSVParser<ALLOC>::allocator_type& alloc )
43  : _line_()
44  , _delimiter_( delimiter )
45  , _spaces_( " \t\r" )
46  , _delimiterPlusSpaces_( _delimiter_ + _spaces_ )
47  , _nbLine_( std::size_t(0) )
48  , _commentMarker_( commentmarker )
49  , _quoteMarker_( quoteMarker )
50  , _emptyData_( true )
51  , _instream_( &instream )
52  , _data_( alloc ) {
53  GUM_CONSTRUCTOR( CSVParser );
54  }
55 
56 
57  /// destructor
58  template <template<typename> class ALLOC>
61  }
62 
63  template <template<typename> class ALLOC>
64  void CSVParser<ALLOC>:: _getNextTriplet_( const std::string& str,
68  std::size_t from ) const {
70 
71  if ( first_letter_token == std::string::npos ) {
73  return;
74  }
75 
76  if ( str.at( first_letter_token ) == _quoteMarker_ ) {
78 
79  if ( last_letter_token == std::string::npos )
80  GUM_SYNTAX_ERROR( "String quote missing", (Size) nbLine(), first_letter_token );
81 
85 
86  if ( next_char < next_token ) {
87  GUM_SYNTAX_ERROR( "Delimiter missing at line", (Size) nbLine(), next_char );
88  }
89  }
90  else {
92 
93  if ( next_token == std::string::npos ) {
95  }
96  else if ( next_token == first_letter_token ) {
98  }
99  else {
102  }
103  }
104  }
105 
106 
107  template <template<typename> class ALLOC>
108  void CSVParser<ALLOC>:: _tokenize_( const std::string& s ) {
109  // looking for first commentMarker not in a string
113 
114  while ( quoteMarker < commentMarker ) {
116 
117  if ( quoteMarkerEnd == std::string::npos )
118  GUM_SYNTAX_ERROR( "String quote missing", (Size) nbLine(), quoteMarker );
119 
120  while ( commentMarker < quoteMarkerEnd ) { // the comment was in the quote
122  }
123 
125  }
126 
127  std::string str = s.substr( 0, commentMarker );
128 
130 
133 
134  while ( ( std::string::npos != first_letter_token ) &&
135  ( std::string::npos != last_letter_token ) ) {
136  if ( _data_.size() <= counter ) _data_.resize( counter + 1 );
137 
138  if ( first_letter_token == next_token ) {
139  _data_[counter] = "";
140  }
141  else if ( last_letter_token >= first_letter_token ) {
142  const std::size_t fieldlength =
146  }
147  else {
148  _data_[counter] = "";
149  }
150 
151  counter++;
152 
153  if ( next_token == std::string::npos ) break;
154 
157  next_token,
159  next_token + 1 );
160  }
161 
162  // case where we end up with an empty field ...
163  if ( ( first_letter_token == std::string::npos ) &&
165  ( next_token == first_letter_token ) ) {
166  counter++;
167  _data_.resize( counter );
168  _data_[counter - 1] = "";
169  }
170  else {
171  _data_.resize( counter );
172  }
173 
174  _emptyData_ = false;
175  }
176 
177 
178  /// reopens a new input stream to parse
179  template <template<typename> class ALLOC>
181  const std::string& delimiter,
182  const char commentmarker,
183  const char quoteMarker ) {
184  _line_.clear ();
186  _spaces_ = " \t\r";
188  _nbLine_ = std::size_t(0);
191  _emptyData_ = true;
192  _instream_ = &instream;
193  _data_.clear ();
194  }
195 
196 
197  // gets the next line of the csv stream and parses it
198  template <template<typename> class ALLOC>
199  INLINE bool CSVParser<ALLOC>::next () {
200  while ( getline( * _instream_, _line_ ) ) {
201  _nbLine_++;
202 
203  if ( _line_.size() == std::size_t (0) ) continue;
204 
205  // fast recognition of commented or empty lines
206  std::size_t lastPos =
208 
209  if ( lastPos == std::string::npos ) continue;
210 
211  if ( _line_.at( lastPos ) == _commentMarker_ ) continue;
212 
213  _tokenize_( _line_ );
214  return true;
215  }
216 
217  return false;
218  }
219 
220 
221  // search for quote taking into account the '\'...
222  template <template<typename> class ALLOC>
223  INLINE std::size_t
225  std::size_t pos ) const {
226  std::size_t res = pos, before;
227 
228  while ( true ) {
230 
231  if ( res == std::string::npos ) return res; // no quote found
232 
233  before = str.find_last_not_of( '\\', res - 1 );
234 
235  if ( before == std::string::npos )
236  return res; // quote found, it is the good one
237 
238  if ( ( res - before ) % 2 == 1 )
239  return res; // the quote is the good one, even if there are some '\'
240  // before
241  }
242  }
243 
244 
245  // returns the current parsed line
246  template <template<typename> class ALLOC>
247  INLINE const std::vector<std::string,ALLOC<std::string>>&
248  CSVParser<ALLOC>::current () const {
249  if ( _emptyData_ )
250  GUM_ERROR( NullElement, "No parsed data" )
251 
252  return _data_;
253  }
254 
255 
256  // returns the current nbLine of parser line
257  template <template<typename> class ALLOC>
258  INLINE const std::size_t CSVParser<ALLOC>::nbLine() const {
259  if ( _nbLine_ == 0 )
260  GUM_ERROR( NullElement, "No parsed data" )
261 
262  return _nbLine_;
263  }
264 
265  } // namespace learning
266 
267 } // namespace gum
268 
269 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)