aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBInitializerFromCSV.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class for initializing DatabaseTable and RawDatabaseTable
24  * instances from CSV files
25  *
26  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
27  */
28 #ifndef GUM_LEARNING_DB_INITILIALIZER_FROM_CSV_H
29 #define GUM_LEARNING_DB_INITILIALIZER_FROM_CSV_H
30 
31 #include <vector>
32 #include <string>
33 #include <fstream>
34 #include <iostream>
35 #include <sstream>
36 
37 #include <agrum/agrum.h>
38 #include <agrum/tools/database/IDBInitializer.h>
39 #include <agrum/tools/database/CSVParser.h>
40 
41 namespace gum {
42 
43  namespace learning {
44 
45  /** @class DBInitializerFromCSV
46  * @headerfile DBInitializerFromCSV.h <agrum/tools/database/DBInitializerFromCSV.h>
47  * @ingroup learning_database
48  * @brief The class for initializing DatabaseTable and RawDatabaseTable
49  * instances from CSV files
50  *
51  * In aGrUM, the usual way to create DatabaseTable instances used by learning
52  * algorithms is to use the 4-step process below:
53  * -# Create an IDBInitializer instance (either a DBInitializerFromCSV or a
54  * DBInitializerFromSQL). This makes it possible to get the variables
55  * corresponding to the columns of the DatabaseTable.
56  * -# Knowing these variables, create a DBTranslatorSet for encoding the
57  * lines of the CSV file or those of the SQL result into the appropriate
58  * values for the learning algorithms.
59  * -# Create the DatabaseTable, passing it the DBTranslatorSet created
60  * in the preceding step. Use the IDBInitializer to provide the variables'
61  * names to the DatabaseTable.
62  * -# Use the IDBInitializer to add the lines of the CSV file or those of the
63  * SQL result into the DatabaseTable.
64  *
65  * @par The following code shows the details of this process:
66  * @code
67  * // 1/ use the initializer to parse all the columns/rows of a CSV file
68  * gum::learning::DBInitializerFromCSV<> initializer ( "asia.csv" );
69  * const auto& var_names = initializer.variableNames ();
70  * const std::size_t nb_vars = var_names.size ();
71  *
72  * // we create as many translators as there are variables
73  * gum::learning::DBTranslator4LabelizedVariable<> translator;
74  * gum::learning::DBTranslatorSet<> translator_set;
75  * for ( std::size_t i = 0; i < nb_vars; ++i )
76  * translator_set.insertTranslator ( translator, i );
77  *
78  * // create a DatabaseTable with these translators. For the moment, the
79  * // DatabaseTable will be empty, i.e., it will contain no row
80  * gum::learning::DatabaseTable<> database ( translator_set );
81  * database.setVariableNames( initializer.variableNames () );
82  *
83  * // use the DBInitializerFromCSV to fill the rows:
84  * initializer.fillDatabase ( database );
85  * // now, the database contains all the content of the CSV file
86  *
87  *
88  * // 2/ use an IDBInitializer to initialize a DatabaseTable, but ignore
89  * // some columns.
90  * gum::learning::DBInitializerFromCSV<> initializer2 ( "asia.csv" );
91  * gum::learning::DatabaseTable<> database2; // empty database
92  *
93  * // indicate which columns of the CSV file should be read
94  * database2.insertTranslator ( translator, 1 );
95  * database2.insertTranslator ( translator, 3 );
96  * database2.insertTranslator ( translator, 4 );
97  *
98  * // sets the names of the columns correctly
99  * database2.setVariableNames( initializer2.variableNames () );
100  *
101  * // fill the rows:
102  * initializer2.fillDatabase ( database2 );
103  * // now all the rows of the CSV file have been transferred into database2,
104  * // but only columns 1, 3 and 4 of the CSV file have been kept.
105  *
106  *
107  * // 3/ another possibility to initialize a DatabaseTable, ignoring
108  * // some columns:
109  * gum::learning::DBInitializerFromCSV<> initializer3 ( "asia.csv" );
110  * gum::learning::DatabaseTable<> database3 ( translator_set );
111  * // here, database3 is an empty database but it already contains
112  * // translators for all the columns of the CSV file. We shall now remove
113  * // the columns/translators that are not wanted anymore
114  * database3.ignoreColumn ( 0 );
115  * database3.ignoreColumn ( 2 );
116  * database3.ignoreColumn ( 5 );
117  * database3.ignoreColumn ( 6 );
118  * database3.ignoreColumn ( 7 );
119  * // asia contains 8 columns. The above ignoreColumn calls keep only columns
120  * // 1, 3 and 4.
121  *
122  * // sets the names of the columns correctly
123  * database3.setVariableNames( initializer3.variableNames () );
124  * // fill the rows:
125  * initializer3.fillDatabase ( database3 );
126  * // now all the rows of the CSV file have been transferred into database3,
127  * // but only columns 1, 3 and 4 of the CSV file have been kept.
128  * @endcode
129  */
130  template < template < typename > class ALLOC = std::allocator >
131  class DBInitializerFromCSV: public IDBInitializer< ALLOC > {
132  public:
133  /// type for the allocators (of std::string) passed in arguments of methods
134  using allocator_type = ALLOC< std::string >;
135 
136  // ##########################################################################
137  /// @name Constructors / Destructors
138  // ##########################################################################
139  /// @{
140 
141  /// default constructor: sets up the initializer for a given CSV file
142  /** @param filename the name of the CSV file to read
143  * @param fileContainsNames a Boolean indicating whether the first line of
144  * the CSV file contains the names of the columns (defaults to true)
145  * @param delimiter the string that acts as the column separator in
146  * the CSV file (defaults to a comma)
147  * @param commentmarker the character that marks the beginning of a comment (defaults to the hash sign)
148  * @param quoteMarker the character that is used to quote the sentences
149  * in the CSV file (defaults to the double quote)
150  * @param alloc the allocator used by all the methods
151  */
152  DBInitializerFromCSV(const std::string filename,
153  bool fileContainsNames = true,
154  const std::string delimiter = ",",
155  const char commentmarker = '#',
156  const char quoteMarker = '"',
157  const allocator_type& alloc = allocator_type());
158 
159  /// copy constructor
160  /** The new initializer points to the same file as from, but it reparses
161  * it from scratch. */
162  DBInitializerFromCSV(const DBInitializerFromCSV< ALLOC >& from);
163 
164  /// copy constructor with a given allocator
165  /** The new initializer points to the same file as from, but it reparses
166  * it from scratch. */
167  DBInitializerFromCSV(const DBInitializerFromCSV< ALLOC >& from, const allocator_type& alloc);
168 
169  /// move constructor
170  DBInitializerFromCSV(DBInitializerFromCSV< ALLOC >&& from);
171 
172  /// move constructor with a given allocator
173  DBInitializerFromCSV(DBInitializerFromCSV< ALLOC >&& from, const allocator_type& alloc);
174 
175  /// virtual copy constructor: returns a pointer to a copy of this initializer
176  virtual DBInitializerFromCSV< ALLOC >* clone() const;
177 
178  /// virtual copy constructor with a given allocator
179  virtual DBInitializerFromCSV< ALLOC >* clone(const allocator_type& alloc) const;
180 
181  /// destructor
182  virtual ~DBInitializerFromCSV();
183 
184  /// @}
185 
186 
187  // ##########################################################################
188  /// @name Operators
189  // ##########################################################################
190 
191  /// @{
192 
193  /// copy operator
194  /** The initializer points to the same file as from, but it reparses
195  * it from scratch. */
196  DBInitializerFromCSV< ALLOC >& operator=(const DBInitializerFromCSV< ALLOC >& from);
197 
198  /// move operator
199  /** The initializer points to the same file as from, but it reparses
200  * it from scratch. */
201  DBInitializerFromCSV< ALLOC >& operator=(DBInitializerFromCSV< ALLOC >&& from);
202 
203  /// @}
204 
205 
206  protected:
207  /// returns the names of the variables (one per column of the CSV file)
208  virtual std::vector< std::string, ALLOC< std::string > > variableNames_() final;
209 
210  /// returns the content of the current row as a vector of strings
211  virtual const std::vector< std::string, ALLOC< std::string > >& currentStringRow_() final;
212 
213  /// indicates whether there is a next row to read (and points to it)
214  virtual bool nextRow_() final;
215 
216 
217 #ifndef DOXYGEN_SHOULD_SKIP_THIS
218 
219  private:
220  // the name of the CSV file used for parsing
221  std::string _filename_;
222 
223  // the delimiter separating the columns within the CSV
224  std::string _delimiter_;
225 
226  // the character that is a comment symbol in the CSV
227  char _comment_marker_;
228 
229  // the character that is a quote symbol in the CSV
230  char _quote_marker_;
231 
232  // indicates whether the first row of the file contains the names
233  bool _first_row_has_names_;
234 
235  // the input stream read by the parser
236  std::ifstream _input_stream_;
237 
238  // the CSV parser used for reading the CSV file
239  CSVParser< ALLOC > _parser_;
240 
241  // the names of the variables, if the first row has names
242  std::vector< std::string, ALLOC< std::string > > _var_names_;
243 
244 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
245  };
246 
247  } /* namespace learning */
248 
249 } /* namespace gum */
250 
251 // always include the template implementation
252 #include <agrum/tools/database/DBInitializerFromCSV_tpl.h>
253 
254 
255 #endif /* GUM_LEARNING_DB_INITILIALIZER_FROM_CSV_H */