aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
CSVParser_tpl.h
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2005-2020 by Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6) *
3  * info_at_agrum_dot_org *
4  * *
5  * This program is free software; you can redistribute it and/or modify *
6  * it under the terms of the GNU General Public License as published by *
7  * the Free Software Foundation; either version 2 of the License, or *
8  * (at your option) any later version. *
9  * *
10  * This program is distributed in the hope that it will be useful, *
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13  * GNU General Public License for more details. *
14  * *
15  * You should have received a copy of the GNU General Public License *
16  * along with this program; if not, write to the *
17  * Free Software Foundation, Inc., *
18  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19  ***************************************************************************/
20 /** @file
21  * @brief Class for fast parsing of CSV files (never more than one line in
22  * application memory)
23  *
24  * @author Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
25  *
26  */
27 #include <agrum/tools/database/CSVParser.h>
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 
31 namespace gum {
32 
33  namespace learning {
34 
35  /// default constructor
    ///
    /// Initializes the parser state; no line is read from the stream here.
    ///
    /// @param instream the stream to parse. Only its address is stored (in
    /// instream__), so the stream has to stay alive for as long as next() is
    /// called on this parser.
    /// @param delimiter the field delimiter string; it is copied into
    /// delimiter__ and also concatenated with the blank characters to build
    /// delimiterPlusSpaces__.
    /// @param commentmarker the character stored in commentMarker__.
    /// @param quoteMarker the character stored in quoteMarker__.
    /// @param alloc the allocator handed to the container of parsed fields
    /// (data__).
36  template <template<typename> class ALLOC>
37  CSVParser<ALLOC>::CSVParser(
38  std::istream& instream,
39  const std::string& delimiter,
40  const char commentmarker,
41  const char quoteMarker,
42  const typename CSVParser<ALLOC>::allocator_type& alloc )
43  : line__()
44  , delimiter__( delimiter )
    // blank characters: space, tab and carriage return
45  , spaces__( " \t\r" )
    // NOTE(review): this reads the members delimiter__ and spaces__, so it is
    // only correct if both are declared before delimiterPlusSpaces__ in the
    // class (members are initialized in declaration order) -- confirm in the
    // class definition.
46  , delimiterPlusSpaces__( delimiter__ + spaces__ )
    // no line read yet: nbLine() reports an error while this is 0
47  , nbLine__( std::size_t(0) )
48  , commentMarker__( commentmarker )
49  , quoteMarker__( quoteMarker )
    // no parsed line available yet: current() reports an error while this is true
50  , emptyData__( true )
51  , instream__( &instream )
52  , data__( alloc ) {
53  GUM_CONSTRUCTOR( CSVParser );
54  }
55 
56 
57  /// destructor
58  template <template<typename> class ALLOC>
61  }
62 
63 
64  // for debugging purposes
65  /*
66  char getTheChar(const std::string& str,Size pos) {
67  return (pos<Size(std::string::npos))?str.at(pos):'$';
68  }
69  */
70 
71 
// Examines the string `str` to locate its next field.
// NOTE(review): several lines of this definition are absent from this listing
// (the parameters between `str` and `from`, and every statement that assigns
// first_letter_token, next_token, last_letter_token and next_char). The
// comments below therefore describe only the visible tests; presumably `from`
// is the index at which the search starts -- confirm against the full source.
72  template <template<typename> class ALLOC>
73  void CSVParser<ALLOC>::getNextTriplet__( const std::string& str,
77  std::size_t from ) const {
79 
    // no start of field was found: leave immediately
80  if ( first_letter_token == std::string::npos ) {
82  return;
83  }
84 
    // case 1: the field starts with the quote marker
85  if ( str.at( first_letter_token ) == quoteMarker__ ) {
87 
    // last_letter_token is npos here: reported as a missing closing quote
88  if ( last_letter_token == std::string::npos )
89  GUM_SYNTAX_ERROR( "String quote missing", (Size) nbLine(), first_letter_token );
90 
94 
    // next_char lies before next_token: reported as a missing delimiter
95  if ( next_char < next_token ) {
96  GUM_SYNTAX_ERROR( "Delimiter missing at line", (Size) nbLine(), next_char );
97  }
98  }
    // case 2: the field does not start with the quote marker; three sub-cases
    // are distinguished on next_token (their bodies are not visible here)
99  else {
101 
102  if ( next_token == std::string::npos ) {
104  }
105  else if ( next_token == first_letter_token ) {
107  }
108  else {
111  }
112  }
113  }
114 
115 
// Splits the line `s` into fields, stores them in data__ (one string per
// field) and marks the parser as holding parsed data (emptyData__ = false).
// The part of the line starting at the first comment marker found outside a
// quoted string is discarded before the fields are extracted.
// A "String quote missing" syntax error is reported through GUM_SYNTAX_ERROR
// when quoteMarkerEnd is npos.
// NOTE(review): several statements of this definition are absent from this
// listing (declarations and updates of commentMarker, quoteMarker,
// quoteMarkerEnd, counter and of the token positions); the comments below
// describe only what is visible.
116  template <template<typename> class ALLOC>
117  void CSVParser<ALLOC>::tokenize__( const std::string& s ) {
118  // look for the first comment marker that is not inside a quoted string
122 
123  while ( quoteMarker < commentMarker ) {
125 
126  if ( quoteMarkerEnd == std::string::npos )
127  GUM_SYNTAX_ERROR( "String quote missing", (Size) nbLine(), quoteMarker );
128 
129  while ( commentMarker < quoteMarkerEnd ) { // the comment was in the quote
131  }
132 
134  }
135 
     // keep only the text located before the comment marker
136  std::string str = s.substr( 0, commentMarker );
137 
139 
142 
     // one iteration per field, as long as both ends of a field are known
143  while ( ( std::string::npos != first_letter_token ) &&
144  ( std::string::npos != last_letter_token ) ) {
     // grow data__ on demand so that data__[counter] exists
145  if ( data__.size() <= counter ) data__.resize( counter + 1 );
146 
     // the field starts where the next token position is: stored as empty
147  if ( first_letter_token == next_token ) {
148  data__[counter] = "";
149  }
150  else if ( last_letter_token >= first_letter_token ) {
151  const std::size_t fieldlength =
155  }
156  else {
157  data__[counter] = "";
158  }
159 
160  counter++;
161 
     // no further token position on the line: stop
162  if ( next_token == std::string::npos ) break;
163 
166  next_token,
168  next_token + 1 );
169  }
170 
171  // case where we end up with an empty field ...
172  if ( ( first_letter_token == std::string::npos ) &&
174  ( next_token == first_letter_token ) ) {
     // append one extra empty field and trim data__ to the number of fields
175  counter++;
176  data__.resize( counter );
177  data__[counter - 1] = "";
178  }
179  else {
     // trim data__ to the number of fields found on this line
180  data__.resize( counter );
181  }
182 
     // parsed data is now available through current()
183  emptyData__ = false;
184  }
185 
186 
187  /// reopens a new input stream to parse
     ///
     /// Puts the parser back into its initial state and makes it read from
     /// another stream: the line buffer and the parsed fields are cleared, the
     /// line counter is reset to 0, emptyData__ is set back to true and only
     /// the address of the new stream is stored (the stream must therefore
     /// outlive its use by the parser).
     /// NOTE(review): the line carrying the function name and its first
     /// parameter, and several assignments, are absent from this listing;
     /// presumably they store delimiter, commentmarker and quoteMarker the same
     /// way the constructor does -- confirm against the full source.
188  template <template<typename> class ALLOC>
190  const std::string& delimiter,
191  const char commentmarker,
192  const char quoteMarker ) {
193  line__.clear ();
     // blank characters: space, tab and carriage return
195  spaces__ = " \t\r";
197  nbLine__ = std::size_t(0);
     // current() reports an error again until a new line is parsed
200  emptyData__ = true;
201  instream__ = &instream;
202  data__.clear ();
203  }
204 
205 
206  // gets the next line of the csv stream and parses it
     //
     // Reads lines from *instream__ until one has to be parsed. nbLine__ is
     // incremented for every line read, including the skipped ones. A line is
     // skipped when it is empty, when lastPos is npos, or when the character at
     // lastPos is the comment marker.
     // Returns true once a line has been passed to tokenize__ (its fields are
     // then available through current()), and false when getline fails, i.e.
     // no line could be read from the stream any more.
207  template <template<typename> class ALLOC>
208  INLINE bool CSVParser<ALLOC>::next () {
209  while ( getline( *instream__, line__ ) ) {
210  nbLine__++;
211 
212  if ( line__.size() == std::size_t (0) ) continue;
213 
214  // fast recognition of commented or blank lines
     // NOTE(review): the initializer of lastPos is absent from this listing;
     // presumably it is the index of the first non-blank character of line__
     // -- confirm against the full source.
215  std::size_t lastPos =
217 
218  if ( lastPos == std::string::npos ) continue;
219 
220  if ( line__.at( lastPos ) == commentMarker__ ) continue;
221 
222  tokenize__( line__ );
223  return true;
224  }
225 
226  return false;
227  }
228 
229 
230  // search for quote taking into account the '\'...
     //
     // Returns the index of a quote that is not escaped by backslashes, or
     // std::string::npos when no quote is found. A candidate index `res` is
     // accepted when the run of consecutive '\' characters placed just before
     // it has an even length (possibly 0).
     // NOTE(review): the line carrying the function name and its first
     // parameter, and the statement that moves `res` to the next candidate at
     // the top of the loop, are absent from this listing.
231  template <template<typename> class ALLOC>
232  INLINE std::size_t
234  std::size_t pos ) const {
235  std::size_t res = pos, before;
236 
237  while ( true ) {
239 
240  if ( res == std::string::npos ) return res; // no quote found
241 
     // index of the last character before res that is not a backslash, so
     // that (res - before - 1) is the number of backslashes preceding res.
     // NOTE(review): if res can be 0 here, res - 1 wraps around to npos and
     // find_last_not_of then scans from the end of the string -- confirm that
     // the missing search statement guarantees res > 0.
242  before = str.find_last_not_of( '\\', res - 1 );
243 
     // only backslashes (or nothing) before res
244  if ( before == std::string::npos )
245  return res; // quote found, it is the good one
246 
     // (res - before) odd <=> even number of backslashes before the quote
247  if ( ( res - before ) % 2 == 1 )
248  return res; // the quote is the good one, even if there are some '\'
249  // before
250  }
251  }
252 
253 
254  // returns the current parsed line
     //
     // Returns a const reference to the member data__, which holds the fields
     // of the last tokenized line; since it refers to the member itself, its
     // content is replaced when another line is tokenized.
     // While emptyData__ is true (no line tokenized since construction or since
     // the stream was changed), a NullElement error with the message
     // "No parsed data" is reported through GUM_ERROR instead.
255  template <template<typename> class ALLOC>
256  INLINE const std::vector<std::string,ALLOC<std::string>>&
257  CSVParser<ALLOC>::current () const {
258  if ( emptyData__ )
259  GUM_ERROR( NullElement, "No parsed data" );
260 
261  return data__;
262  }
263 
264 
265  // returns the number of lines read so far from the stream
     //
     // The counter nbLine__ is incremented by next() for every line it reads
     // (skipped lines included), so the value is the 1-based number of the last
     // line read. While the counter is still 0, a NullElement error with the
     // message "No parsed data" is reported through GUM_ERROR instead.
266  template <template<typename> class ALLOC>
267  INLINE const std::size_t CSVParser<ALLOC>::nbLine() const {
268  if ( nbLine__ == 0 )
269  GUM_ERROR( NullElement, "No parsed data" );
270 
271  return nbLine__;
272  }
273 
274  } // namespace learning
275 
276 } // namespace gum
277 
278 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)