aGrUM  0.16.0
CSVParser_tpl.h
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2005 by Christophe GONZALES and Pierre-Henri WUILLEMIN *
3  * {prenom.nom}_at_lip6.fr *
4  * *
5  * This program is free software; you can redistribute it and/or modify *
6  * it under the terms of the GNU General Public License as published by *
7  * the Free Software Foundation; either version 2 of the License, or *
8  * (at your option) any later version. *
9  * *
10  * This program is distributed in the hope that it will be useful, *
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13  * GNU General Public License for more details. *
14  * *
15  * You should have received a copy of the GNU General Public License *
16  * along with this program; if not, write to the *
17  * Free Software Foundation, Inc., *
18  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19  ***************************************************************************/
28 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 
31 namespace gum {
32 
33  namespace learning {
34 
36  template <template<typename> class ALLOC>
38  std::istream& instream,
39  const std::string& delimiter,
40  const char commentmarker,
41  const char quoteMarker,
42  const typename CSVParser<ALLOC>::allocator_type& alloc )
43  : __line()
44  , __delimiter( delimiter )
45  , __spaces( " \t\r" )
46  , __delimiterPlusSpaces( __delimiter + __spaces )
47  , __nbLine( std::size_t(0) )
48  , __commentMarker( commentmarker )
49  , __quoteMarker( quoteMarker )
50  , __emptyData( true )
51  , __instream( &instream )
52  , __data( alloc ) {
53  GUM_CONSTRUCTOR( CSVParser );
54  }
55 
56 
58  template <template<typename> class ALLOC>
60  GUM_DESTRUCTOR( CSVParser );
61  }
62 
63 
64  // for debugging purpose
65  /*
66  char getTheChar(const std::string& str,Size pos) {
67  return (pos<Size(std::string::npos))?str.at(pos):'$';
68  }
69  */
70 
71 
72  template <template<typename> class ALLOC>
73  void CSVParser<ALLOC>::__getNextTriplet( const std::string& str,
74  std::size_t& first_letter_token,
75  std::size_t& next_token,
76  std::size_t& last_letter_token,
77  std::size_t from ) const {
78  first_letter_token = str.find_first_not_of( __spaces, from );
79 
80  if ( first_letter_token == std::string::npos ) {
81  next_token = last_letter_token = first_letter_token;
82  return;
83  }
84 
85  if ( str.at( first_letter_token ) == __quoteMarker ) {
86  last_letter_token = __correspondingQuoteMarker( str, first_letter_token );
87 
88  if ( last_letter_token == std::string::npos )
89  GUM_SYNTAX_ERROR( "String quote missing", (Size) nbLine(), first_letter_token );
90 
91  next_token = str.find_first_of( __delimiter, last_letter_token + 1 );
92  std::size_t next_char =
93  str.find_first_not_of( __spaces, last_letter_token + 1 );
94 
95  if ( next_char < next_token ) {
96  GUM_SYNTAX_ERROR( "Delimiter missing at line", (Size) nbLine(), next_char );
97  }
98  }
99  else {
100  next_token = str.find_first_of( __delimiter, first_letter_token );
101 
102  if ( next_token == std::string::npos ) {
103  last_letter_token = str.find_last_not_of( __spaces, next_token );
104  }
105  else if ( next_token == first_letter_token ) {
106  last_letter_token = first_letter_token;
107  }
108  else {
109  last_letter_token =
110  str.find_last_not_of( __delimiterPlusSpaces, next_token - 1 );
111  }
112  }
113  }
114 
115 
116  template <template<typename> class ALLOC>
117  void CSVParser<ALLOC>::__tokenize( const std::string& s ) {
118  // looking for first commentMarker not in a string
119  std::size_t commentMarker = s.find_first_of( __commentMarker, 0 );
120  std::size_t quoteMarker = s.find_first_of( __quoteMarker, 0 );
121  std::size_t quoteMarkerEnd;
122 
123  while ( quoteMarker < commentMarker ) {
124  quoteMarkerEnd = __correspondingQuoteMarker( s, quoteMarker );
125 
126  if ( quoteMarkerEnd == std::string::npos )
127  GUM_SYNTAX_ERROR( "String quote missing", (Size) nbLine(), quoteMarker );
128 
129  while ( commentMarker < quoteMarkerEnd ) { // the comment was in the quote
130  commentMarker = s.find_first_of( __commentMarker, commentMarker + 1 );
131  }
132 
133  quoteMarker = s.find_first_of( __quoteMarker, quoteMarkerEnd + 1 );
134  }
135 
136  std::string str = s.substr( 0, commentMarker );
137 
138  std::size_t counter = 0, first_letter_token, next_token, last_letter_token;
139 
140  __getNextTriplet(
141  str, first_letter_token, next_token, last_letter_token, 0 );
142 
143  while ( ( std::string::npos != first_letter_token ) &&
144  ( std::string::npos != last_letter_token ) ) {
145  if ( __data.size() <= counter ) __data.resize( counter + 1 );
146 
147  if ( first_letter_token == next_token ) {
148  __data[counter] = "";
149  }
150  else if ( last_letter_token >= first_letter_token ) {
151  const std::size_t fieldlength =
152  last_letter_token + 1 - first_letter_token;
153  __data[counter].resize( fieldlength );
154  __data[counter].assign( str, first_letter_token, fieldlength );
155  }
156  else {
157  __data[counter] = "";
158  }
159 
160  counter++;
161 
162  if ( next_token == std::string::npos ) break;
163 
164  __getNextTriplet( str,
165  first_letter_token,
166  next_token,
167  last_letter_token,
168  next_token + 1 );
169  }
170 
171  // case where we end up with an empty field ...
172  if ( ( first_letter_token == std::string::npos ) &&
173  ( last_letter_token == first_letter_token ) &&
174  ( next_token == first_letter_token ) ) {
175  counter++;
176  __data.resize( counter );
177  __data[counter - 1] = "";
178  }
179  else {
180  __data.resize( counter );
181  }
182 
183  __emptyData = false;
184  }
185 
186 
188  template <template<typename> class ALLOC>
189  void CSVParser<ALLOC>::useNewStream (std::istream& instream,
190  const std::string& delimiter,
191  const char commentmarker,
192  const char quoteMarker ) {
193  __line.clear ();
194  __delimiter = delimiter;
195  __spaces = " \t\r";
196  __delimiterPlusSpaces = __delimiter + __spaces;
197  __nbLine = std::size_t(0);
198  __commentMarker = commentmarker;
199  __quoteMarker = quoteMarker;
200  __emptyData = true;
201  __instream = &instream;
202  __data.clear ();
203  }
204 
205 
206  // gets the next line of the csv stream and parses it
207  template <template<typename> class ALLOC>
208  INLINE bool CSVParser<ALLOC>::next () {
209  while ( getline( *__instream, __line ) ) {
210  __nbLine++;
211 
212  if ( __line.size() == std::size_t (0) ) continue;
213 
214  // fast recognition of commented or empty lines lines
215  std::size_t lastPos =
216  __line.find_first_not_of( __spaces, std::size_t(0) );
217 
218  if ( lastPos == std::string::npos ) continue;
219 
220  if ( __line.at( lastPos ) == __commentMarker ) continue;
221 
222  __tokenize( __line );
223  return true;
224  }
225 
226  return false;
227  }
228 
229 
230  // search for quote taking into account the '\'...
231  template <template<typename> class ALLOC>
232  INLINE std::size_t
233  CSVParser<ALLOC>::__correspondingQuoteMarker( const std::string& str,
234  std::size_t pos ) const {
235  std::size_t res = pos, before;
236 
237  while ( true ) {
238  res = str.find_first_of( __quoteMarker, res + 1 );
239 
240  if ( res == std::string::npos ) return res; // no quote found
241 
242  before = str.find_last_not_of( '\\', res - 1 );
243 
244  if ( before == std::string::npos )
245  return res; // quote found, it is the good one
246 
247  if ( ( res - before ) % 2 == 1 )
248  return res; // the quote is the good one, even if there are some '\'
249  // before
250  }
251  }
252 
253 
254  // returns the current parsed line
255  template <template<typename> class ALLOC>
256  INLINE const std::vector<std::string,ALLOC<std::string>>&
257  CSVParser<ALLOC>::current () const {
258  if ( __emptyData )
259  GUM_ERROR( NullElement, "No parsed data" );
260 
261  return __data;
262  }
263 
264 
265  // returns the current nbLine of parser line
266  template <template<typename> class ALLOC>
267  INLINE const std::size_t CSVParser<ALLOC>::nbLine() const {
268  if ( __nbLine == 0 )
269  GUM_ERROR( NullElement, "No parsed data" );
270 
271  return __nbLine;
272  }
273 
274  } // namespace learning
275 
276 } // namespace gum
277 
278 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
void useNewStream(std::istream &in, const std::string &delimiter=",", const char commentmarker='#', const char quoteMarker='"' )
reopens a new input stream to parse
const std::size_t nbLine() const
returns the current line number within the stream
ALLOC< std::string > allocator_type
type for the allocators passed in arguments of methods
Definition: CSVParser.h:82
#define GUM_SYNTAX_ERROR(msg, line, column)
Definition: exceptions.h:94
STL namespace.
Copyright 2005-2019 Pierre-Henri WUILLEMIN et Christophe GONZALES (LIP6) {prenom.nom}_at_lip6.fr.
Definition: agrum.h:25
CSVParser(std::istream &in, const std::string &delimiter=",", const char commentmarker='#', const char quoteMarker='"', const allocator_type& alloc = allocator_type () )
default constructor
bool next()
gets the next line of the csv stream and parses it
virtual ~CSVParser()
destructor
const std::vector< std::string, ALLOC< std::string > > & current() const
returns the current parsed line
std::size_t Size
In aGrUM, hashed values are unsigned long int.
Definition: types.h:48
#define GUM_ERROR(type, msg)
Definition: exceptions.h:55
Class for fast parsing of CSV files (never more than one line in application memory) ...