aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
DBInitializerFromCSV.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class for initializing DatabaseTable and RawDatabaseTable
24  * instances from CSV files
25  *
26  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
27  */
28 #ifndef GUM_LEARNING_DB_INITILIALIZER_FROM_CSV_H
29 #define GUM_LEARNING_DB_INITILIALIZER_FROM_CSV_H
30 
31 #include <vector>
32 #include <string>
33 #include <fstream>
34 #include <iostream>
35 #include <sstream>
36 
37 #include <agrum/agrum.h>
38 #include <agrum/tools/database/IDBInitializer.h>
39 #include <agrum/tools/database/CSVParser.h>
40 
41 namespace gum {
42 
43  namespace learning {
44 
45  /** @class DBInitializerFromCSV
46  * @headerfile DBInitializerFromCSV.h <agrum/tools/database/DBInitializerFromCSV.h>
47  * @ingroup learning_database
48  * @brief The class for initializing DatabaseTable and RawDatabaseTable
49  * instances from CSV files
50  *
51  * In aGrUM, the usual way to create DatabaseTable instances used by learning
52  * algorithms is to use the 4-step process below:
53  * -# Create an IDBInitializer instance (either a DBInitializerFromCSV or a
54  * DBInitializerFromSQL). This makes it possible to get the variables
55  * corresponding to the columns of the DatabaseTable.
56  * -# Knowing these variables, create a DBTranslatorSet for encoding the
57  * lines of the CSV file or those of the SQL result into the appropriate
58  * values for the learning algorithms.
59  * -# Create the DatabaseTable, passing it the DBTranslatorSet created
60  * in the preceding step. Use the IDBInitializer to provide the variables'
61  * names to the DatabaseTable.
62  * -# Use the IDBInitializer to add the lines of the CSV file or those of the
63  * SQL result into the DatabaseTable.
64  *
65  * @par The following codes show the details of this process:
66  * @code
67  * // 1/ use the initializer to parse all the columns/rows of a CSV file
68  * gum::learning::DBInitializerFromCSV<> initializer ( "asia.csv" );
69  * const auto& var_names = initializer.variableNames ();
70  * const std::size_t nb_vars = var_names.size ();
71  *
72  * // we create as many translators as there are variables
73  * gum::learning::DBTranslator4LabelizedVariable<> translator;
74  * gum::learning::DBTranslatorSet<> translator_set;
75  * for ( std::size_t i = 0; i < nb_vars; ++i )
76  * translator_set.insertTranslator ( translator, i );
77  *
78  * // create a DatabaseTable with these translators. For the moment, the
79  * // DatabaseTable will be empty, i.e., it will contain no row
80  * gum::learning::DatabaseTable<> database ( translator_set );
81  * database.setVariableNames( initializer.variableNames () );
82  *
83  * // use the DBInitializerFromCSV to fill the rows:
84  * initializer.fillDatabase ( database );
85  * // now, the database contains all the content of the CSV file
86  *
87  *
88  * // 2/ use an IDBInitializer to initialize a DatabaseTable, but ignore
89  * // some columns.
90  * gum::learning::DBInitializerFromCSV<> initializer2 ( "asia.csv" );
91  * gum::learning::DatabaseTable<> database2; // empty database
92  *
93  * // indicate which columns of the CSV file should be read
94  * database2.insertTranslator ( translator, 1 );
95  * database2.insertTranslator ( translator, 3 );
96  * database2.insertTranslator ( translator, 4 );
97  *
98  * // sets the names of the columns correctly
99  * database2.setVariableNames( initializer2.variableNames () );
100  *
101  * // fill the rows:
102  * initializer2.fillDatabase ( database2 );
103  * // now all the rows of the CSV file have been transferred into database2,
104  * // but only columns 1, 3 and 4 of the CSV file have been kept.
105  *
106  *
107  * // 3/ another possibility to initialize a DatabaseTable, ignoring
108  * // some columns:
109  * gum::learning::DBInitializerFromCSV<> initializer3 ( "asia.csv" );
110  * gum::learning::DatabaseTable<> database3 ( translator_set );
111  * // here, database3 is an empty database but it contains already
112  * // translators for all the columns of the CSV file. We shall now remove
113  * // the columns/translators that are not wanted anymore
114  * database3.ignoreColumn ( 0 );
115  * database3.ignoreColumn ( 2 );
116  * database3.ignoreColumn ( 5 );
117  * database3.ignoreColumn ( 6 );
118  * database3.ignoreColumn ( 7 );
119  * // asia contains 8 columns. The above ignoreColumns keep only columns
120  * // 1, 3 and 4.
121  *
122  * // sets the names of the columns correctly
123  * database3.setVariableNames( initializer3.variableNames () );
124  * // fill the rows:
125  * initializer3.fillDatabase ( database3 );
126  * // now all the rows of the CSV file have been transferred into database3,
127  * // but only columns 1, 3 and 4 of the CSV file have been kept.
128  * @endcode
129  */
130  template < template < typename > class ALLOC = std::allocator >
131  class DBInitializerFromCSV: public IDBInitializer< ALLOC > {
132  public:
133  /// type for the allocators passed in arguments of methods
134  using allocator_type = ALLOC< std::string >;
135 
136  // ##########################################################################
137  /// @name Constructors / Destructors
138  // ##########################################################################
139  /// @{
140 
141  /// constructor from the name of a CSV file
142  /** @param filename the name of the CSV file
143  * @param fileContainsNames a Boolean indicating whether the first line of
144  * the CSV file contains the names of the columns
145  * @param delimiter the string that acts as the column separator in
146  * the CSV file
147  * @param commentmarker the character that marks the beginning of a comment
148  * @param quoteMarker the character that is used to quote the sentences
149  * in the CSV file
150  * @param alloc the allocator used by all the methods
151  */
152  DBInitializerFromCSV(const std::string filename,
153  bool fileContainsNames = true,
154  const std::string delimiter = ",",
155  const char commentmarker = '#',
156  const char quoteMarker = '"',
157  const allocator_type& alloc = allocator_type());
158 
159  /// copy constructor
160  /** the new initializer points to the same file as from, but it reparses
161  * it from scratch. */
162  DBInitializerFromCSV(const DBInitializerFromCSV< ALLOC >& from);
163 
164  /// copy constructor with a given allocator
165  /** the new initializer points to the same file as from, but it reparses
166  * it from scratch. */
167  DBInitializerFromCSV(const DBInitializerFromCSV< ALLOC >& from,
168  const allocator_type& alloc);
169 
170  /// move constructor
171  DBInitializerFromCSV(DBInitializerFromCSV< ALLOC >&& from);
172 
173  /// move constructor with a given allocator
174  DBInitializerFromCSV(DBInitializerFromCSV< ALLOC >&& from,
175  const allocator_type& alloc);
176 
177  /// virtual copy constructor
178  virtual DBInitializerFromCSV< ALLOC >* clone() const;
179 
180  /// virtual copy constructor with a given allocator
181  virtual DBInitializerFromCSV< ALLOC >*
182  clone(const allocator_type& alloc) const;
183 
184  /// destructor
185  virtual ~DBInitializerFromCSV();
186 
187  /// @}
188 
189 
190  // ##########################################################################
191  /// @name Operators
192  // ##########################################################################
193 
194  /// @{
195 
196  /// copy operator
197  /** the initializer points to the same file as from, but it reparses
198  * it from scratch. */
199  DBInitializerFromCSV< ALLOC >&
200  operator=(const DBInitializerFromCSV< ALLOC >& from);
201 
202  /// move operator
203  /** the initializer points to the same file as from, but it reparses
204  * it from scratch. */
205  DBInitializerFromCSV< ALLOC >&
206  operator=(DBInitializerFromCSV< ALLOC >&& from);
207 
208  /// @}
209 
210 
211  protected:
212  /// returns the names of the variables
213  virtual std::vector< std::string, ALLOC< std::string > >
214  variableNames_() final;
215 
216  /// returns the content of the current row using strings
217  virtual const std::vector< std::string, ALLOC< std::string > >&
218  currentStringRow_() final;
219 
220  /// indicates whether there is a next row to read (and points to it)
221  virtual bool nextRow_() final;
222 
223 
224 #ifndef DOXYGEN_SHOULD_SKIP_THIS
225 
226  private:
227  // the filename used for parsing
228  std::string filename__;
229 
230  // indicates the delimiter used within the CSV
231  std::string delimiter__;
232 
233  // indicates which character is a comment symbol in the CSV
234  char comment_marker__;
235 
236  // indicates which character is a quote symbol in the CSV
237  char quote_marker__;
238 
239  // indicates whether the first row of the file contains the names
240  bool first_row_has_names__;
241 
242  // the input stream read by the parser
243  std::ifstream input_stream__;
244 
245  // the CSV parser used for reading the CSV file
246  CSVParser< ALLOC > parser__;
247 
248  // the variable names, if the first row has names
249  std::vector< std::string, ALLOC< std::string > > var_names__;
250 
251 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
252  };
253 
254  } /* namespace learning */
255 
256 } /* namespace gum */
257 
258 // always include the template implementation
259 #include <agrum/tools/database/DBInitializerFromCSV_tpl.h>
260 
261 
262 #endif /* GUM_LEARNING_DB_INITILIALIZER_FROM_CSV_H */