aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
rawDatabaseTable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The table containing the raw/original data of a database
24  *
25  * Class RawDatabaseTable is intended to store in RAM the raw/original data
26  * of a database. Such raw data are not well suited for learning tasks
27  * because they need to be interpreted by the learning algorithm, which would
28  * incur a strong overhead. However, reading a CSV file and interpreting its
29  * data in order to reshape them in a way that will allow fast parsing by
30  * learning algorithms is also very time consuming. So, if you are unsure
31  * about the correct interpretation and need to change it several times either
32  * before processing the learning or during several learning phases, it is
33  * efficient to first read the CSV file and store its useful data (removing
34  * comments, for instance) into a first database table and, then, use this
35  * preprocessed table to produce quickly the interpreted database table that
36  * will subsequently be used by the learning. The purpose of the
37  * RawDatabaseTable class is precisely to implement this preprocessed table.
38  *
39  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
40  */
41 #ifndef GUM_RAW_DATABASE_TABLE_H
42 #define GUM_RAW_DATABASE_TABLE_H
43 
44 
45 #include <agrum/agrum.h>
46 #include <agrum/tools/database/DBCell.h>
47 #include <agrum/tools/database/IDatabaseTable.h>
48 
49 namespace gum {
50 
51  namespace learning {
52 
53  /** @class RawDatabaseTable
54  * @brief The table containing the raw/original data of a database
55  * @headerfile rawDatabaseTable.h <agrum/tools/database/rawDatabaseTable.h>
56  * @ingroup learning_database
57  *
58  * Class RawDatabaseTable is intended to store in RAM the raw/original data
59  * of a database. Such raw data are not well suited for learning tasks
60  * because they need to be interpreted by the learning algorithm, which would
61  * incur a strong overhead. However, reading a CSV file and interpreting its
62  * data in order to reshape them in a way that will allow fast parsing by
63  * learning algorithms is also very time consuming. So, if you are unsure
64  * about the correct interpretation and need to change it several times either
65  * before processing the learning or during several learning phases, it is
66  * efficient to first read the CSV file and store its useful data (removing
67  * comments, for instance) into a first database table and, then, use this
68  * preprocessed table to produce quickly the interpreted database table that
69  * will subsequently be used by the learning. The purpose of the
70  * RawDatabaseTable class is precisely to implement this preprocessed table.
71  *
72  * @par Usage example:
73  * @code
74  * // create an empty database
75  * gum::learning::RawDatabaseTable<> database;
76  *
77  * // create a new row with 3 DBCells containing integer 2
78  * typename gum::learning::RawDatabaseTable<>::template
79  * Row<gum::learning::DBCell> new_row ( 3, gum::learning::DBCell ( 2 ) );
80  *
81  * // add it into the database
82  * database.insertRow ( new_row );
83  * database.insertRow ( std::move ( new_row ) );
84  *
85  * // erase the first Row
86  * database.eraseFirstRow ();
87  *
88  * // returns the content of the database
89  * const auto& content = database.content ();
90  *
91  * // sets the names of the variables (the columns) of the database
92  * std::vector<std::string> new_names { "col1", "col2", "col3" };
93  * database.setVariableNames ( new_names );
94  *
95  * // print the names of the columns
96  * std::cout << database.variableNames () << std::endl;
97  *
98  * // print all the records of the database
99  * for ( auto row : database )
100  * std::cout << row << std::endl;
101  *
102  * // make the handler parse the 3rd record to the 5th record (included)
103  * auto handler = database.handler ();
104  * handler.setRange ( 2, 5 ); // 2 = 3rd record; 5 = 6th record (excluded)
105  * for ( const auto& row : handler ) {
106  * std::cout << row << std::endl;
107  * }
108  *
109  * // clears the content of the database and updates the safe database's
110  * // handlers
111  * database.clear ();
112  * @endcode
113  */
114  template < template < typename > class ALLOC = std::allocator >
116  public:
117  /// the type for the vectors used in the RawDatabaseTable
118  template < typename TX_DATA >
120 
121  /// a row of the database
122  template < typename TX_DATA >
123  using Row = DBRow< TX_DATA, ALLOC >;
124 
125  /// the type for the matrices stored into the database
126  template < typename TX_DATA >
127  using Matrix = DBVector< Row< TX_DATA > >;
128 
129  template < template < typename > class XALLOC >
131 
132  /// the unsafe handler type
133  using Handler = typename IDatabaseTable< DBCell, ALLOC >::Handler;
134 
135  /// the safe handler type
137 
139 
140  /// Types for STL compliance.
141  /// @{
142  using value_type = Row< DBCell >;
144  using const_reference = const value_type&;
145  using pointer = value_type*;
146  using const_pointer = const value_type*;
147  using size_type = std::size_t;
149  using iterator = Handler;
152  /// @}
153 
154 
155  // ##########################################################################
156  /// @name Constructors / Destructors
157  // ##########################################################################
158  /// @{
159 
160  /// constructor taking the missing symbols and the names of the variables
161  template < template < typename > class VARALLOC, template < typename > class MISSALLOC >
162  RawDatabaseTable(const MissingValType< MISSALLOC >& missing_symbols,
163  const std::vector< std::string, VARALLOC< std::string > >& var_names,
164  const allocator_type& alloc = allocator_type());
165 
166  /// constructor taking the missing symbols
167  template < template < typename > class MISSALLOC >
168  RawDatabaseTable(const MissingValType< MISSALLOC >& missing_symbols,
169  const allocator_type& alloc = allocator_type());
170 
171  /// default constructor
172  RawDatabaseTable(const allocator_type& alloc = allocator_type());
173 
174  /// copy constructor
175  RawDatabaseTable(const RawDatabaseTable< ALLOC >& from);
176 
177  /// copy constructor with a given allocator
178  RawDatabaseTable(const RawDatabaseTable< ALLOC >& from, const allocator_type& alloc);
179 
180  /// move constructor
181  RawDatabaseTable(RawDatabaseTable< ALLOC >&& from);
182 
183  /// move constructor with a given allocator
184  RawDatabaseTable(RawDatabaseTable< ALLOC >&& from, const allocator_type& alloc);
185 
186  /// virtual copy constructor
187  virtual RawDatabaseTable< ALLOC >* clone() const final;
188 
189  /// virtual copy constructor with a given allocator
190  virtual RawDatabaseTable< ALLOC >* clone(const allocator_type& alloc) const final;
191 
192  /// destructor
193  virtual ~RawDatabaseTable();
194 
195  /// @}
196 
197 
198  // ##########################################################################
199  /// @name Operators
200  // ##########################################################################
201  /// @{
202 
203  /// copy operator
205 
206  /// move operator
208 
209  /// @}
210 
211 
212  // ##########################################################################
213  /// @name Accessors / Modifiers
214  // ##########################################################################
215  /// @{
216 
218 
219  /// sets the names of the variables
220  /** This method can be called in two different ways: either the names
221  * correspond precisely to the columns stored into the database table
222  * (in this case, parameter from_external_object is equal to false),
223  * or they correspond to the columns of an external database (e.g., a
224  * CSV file) from which we potentially excluded some columns and,
225  * consequently, these columns should not be taken into account (in this
226  * case, parameter from_external_object is equal to true). As an
227  * example, imagine that the database table is created from a CSV file
228  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Suppose that
229  * we asked the database table to ignore columns X1 and X3. Then
230  * setVariableNames( { "X0", "X1", "X2", "X3", "X4" }, true ) will
231  * set the columns of the database table as { "X0", "X2", "X4" }. The
232  * same result could be obtained by executing
233  * setVariableNames( { "X0", "X2", "X4" }, false ), which specifies
234  * directly the set of names to retain in the database table.
235  * @param names the names of all the columns, including the ignored
236  * columns if from_external_object is set to true, else excluding
237  * them (i.e., this should precisely correspond to the columns stored
238  * into the database table).
239  * @param from_external_object a Boolean indicating whether parameter
240  * names includes the columns ignored by the database table (true) or
241  * not (false).
242  * @throw SizeError is raised if the names passed in arguments cannot be
243  * assigned to the columns of the RawDatabaseTable because the size of their
244  * vector is inadequate. */
245  virtual void setVariableNames(const std::vector< std::string, ALLOC< std::string > >& names,
246  const bool from_external_object = true) final;
247 
248  /// makes the database table ignore from now on the kth column
249  /** This method can be called in two different ways: either k refers to
250  * the current kth column of the database table (in this case parameter
251  * from_external_object is set to false), or k corresponds to the kth
252  * column of an original database used to fill the database table
253  * (in this case from_external_object is set to true). Depending on
254  * from_external_object's value, the ignored columns may differ. As an
255  * example, imagine that the database table is created from a CSV file
256  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Then a call to
257  * ignoreColumn ( 1, true ) will exclude column X1 from the database table.
258  * As a result, the database table columns are X0, X2, X3 and X4.
259  * Therefore, subsequently calling ignoreColumn ( 1, false ) will result
260  * in excluding X2 since X2 is the 2nd column (columns are indexed
261  * starting from 0). So, now the database table's columns are
262  * X0, X3 and X4. If, now, we call ignoreColumn ( 3, true ), this will
263  * remove column X3 because, in the original database, X3 was the 4th
264  * column.
265  *
266  * @warning If the database table was not empty, then the kth column is
267  * removed from all the rows currently stored.
268  * @warning If the kth column does not exist (i.e., the original dataset
269  * does not contain the kth column when from_external_object is set to
270  * true, or the RawDatabaseTable has no kth column when from_external_object
271  * is set to false), column k is marked as to be ignored and nothing is
272  * done on the content of the RawDatabaseTable. No exception is raised.
273  * @param k the column to remove. See Method setVariableNames for a
274  * detailed description on how k is computed.
275  * @param from_external_object indicates whether k refers to the kth
276  * column of an original external database (true) or to the current kth
277  * column of the RawDatabaseTable (false). */
278  virtual void ignoreColumn(const std::size_t k, const bool from_external_object = true) final;
279 
280  /// returns the set of columns of the original dataset that are ignored
281  virtual const DBVector< std::size_t > ignoredColumns() const final;
282 
283  /** @brief returns the set of columns of the original dataset that are
284  * present in the RawDatabaseTable */
285  virtual const DBVector< std::size_t > inputColumns() const final;
286 
288 
289  /// insert a new row at the end of the database
290  /** The new_row passed in argument is supposed to come from an external
291  * database. So it must contain data for the ignored columns.
292  * @throw SizeError is raised if the vector of strings cannot be inserted
293  * in the RawDatabaseTable because its size does not allow a matching with
294  * the columns of the RawDatabaseTable (taking into account the ignored
295  * columns) */
296  virtual void insertRow(const std::vector< std::string, ALLOC< std::string > >& new_row) final;
297 
298  /// erase the content of the database, including the names of the variables
299  virtual void clear() final;
300 
301  /// @}
302 
303 
304 #ifndef DOXYGEN_SHOULD_SKIP_THIS
305 
306  private:
307  // the set of ignored columns, sorted by increasing order
308  DBVector< std::size_t > _ignored_cols_;
309 
310  /// translates a string into a DBCell and returns it
311  DBCell _convert_(const std::string& elt) const;
312 
313 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
314  };
315 
316  } /* namespace learning */
317 
318 } /* namespace gum */
319 
320 /// always include the templated implementations
321 #include <agrum/tools/database/rawDatabaseTable_tpl.h>
322 
323 #endif /* GUM_RAW_DATABASE_TABLE_H */
RawDatabaseTable< ALLOC > & operator=(RawDatabaseTable< ALLOC > &&from)
move operator
The table containing the raw/original data of a databaseClass RawDatabaseTable is intended to store i...
virtual void ignoreColumn(const std::size_t k, const bool from_external_object=true) final
makes the database table ignore from now on the kth column
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
RawDatabaseTable(RawDatabaseTable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual const DBVector< std::size_t > ignoredColumns() const final
returns the set of columns of the original dataset that are ignored
virtual const DBVector< std::size_t > inputColumns() const final
returns the set of columns of the original dataset that are present in the RawDatabaseTable ...
RawDatabaseTable(const RawDatabaseTable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
RawDatabaseTable(const MissingValType< MISSALLOC > &missing_symbols, const allocator_type &alloc=allocator_type())
default constructor
virtual RawDatabaseTable< ALLOC > * clone(const allocator_type &alloc) const final
virtual copy constructor with a given allocator
RawDatabaseTable(const RawDatabaseTable< ALLOC > &from)
copy constructor
RawDatabaseTable(RawDatabaseTable< ALLOC > &&from)
move constructor
virtual void insertRow(const std::vector< std::string, ALLOC< std::string > > &new_row) final
insert a new row at the end of the database
RawDatabaseTable(const MissingValType< MISSALLOC > &missing_symbols, const std::vector< std::string, VARALLOC< std::string > > &var_names, const allocator_type &alloc=allocator_type())
default constructor
virtual ~RawDatabaseTable()
destructor
virtual RawDatabaseTable< ALLOC > * clone() const final
virtual copy constructor
virtual void clear() final
erase the content of the database, including the names of the variables
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
RawDatabaseTable< ALLOC > & operator=(const RawDatabaseTable< ALLOC > &from)
copy operator
virtual void setVariableNames(const std::vector< std::string, ALLOC< std::string > > &names, const bool from_external_object=true) final
sets the names of the variables