aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
databaseTable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class representing a tabular database stored in RAM
24  *
25  * Class DatabaseTable represents a tabular database that stores in the
26  * computer's random access memory (RAM) its content as a vector of DBRows
27  * of DBTranslatedValue instances.
28  *
29  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
30  */
31 #ifndef GUM_DATABASE_TABLE_H
32 #define GUM_DATABASE_TABLE_H
33 
34 #include <numeric>
35 #include <algorithm>
36 #include <functional>
37 #include <exception>
38 #include <vector>
39 
40 #include <agrum/agrum.h>
41 #include <agrum/tools/core/math/math_utils.h>
42 #include <agrum/tools/core/set.h>
43 #include <agrum/tools/core/thread.h>
44 #include <agrum/tools/database/DBCell.h>
45 #include <agrum/tools/database/DBRow.h>
46 #include <agrum/tools/database/DBTranslatedValue.h>
47 #include <agrum/tools/database/IDatabaseTable.h>
48 #include <agrum/tools/database/DBTranslatorSet.h>
49 #include <agrum/tools/database/DBTranslator4ContinuousVariable.h>
50 
51 namespace gum {
52 
53  namespace learning {
54 
55  /** @class DatabaseTable
56  * @brief The class representing a tabular database as used by learning tasks.
57  * @headerfile databaseTable.h <agrum/tools/database/databaseTable.h>
58  * @ingroup learning_database
59  *
60  * Class DatabaseTable represents a tabular database that stores in the
61  * computer's random access memory (RAM) its content as a vector of DBRows
62  * of DBTranslatedValue instances. This class is very well suited for fast
63  * learning algorithms.
64  *
65  * @par Usage example:
66  * @code
67  * // create the database from a CSV. This is not compulsory for
68  * // DatabaseTable instances, but this is how we usually create
69  * // DatabaseTable instances
70  * gum::learning::DBInitializerFromCSV<> initializer ( "asia.csv" );
71  * const auto& var_names = initializer.variableNames ();
72  * gum::learning::DBTranslatorSet<> translator_set;
73  * gum::learning::DBTranslator4LabelizedVariable<> translator;
74  * for ( std::size_t i = 0; i < var_names.size(); ++i )
75  * translator_set.insertTranslator ( translator, i );
76  * gum::learning::DatabaseTable<> database ( translator_set );
77  * database.setVariableNames( initializer.variableNames () );
78  *
79  * // here, database contains the content of the asia.csv file.
80  * // determine how many columns and rows the database contains
81  * std::size_t nb_rows = database.nbRows();
82  * std::size_t nb_cols = database.nbVariables ();
83  *
84  * // manually add a new row into the database
85  * std::vector<std::string> row( 8, "toto" ); // asia has 8 columns
86  * database.insertRow ( row );
87  * gum::learning::DBRow<gum::learning::DBTranslatedValue>
88  * dbrow ( 8, gum::learning::DBTranslatedValue { std::size_t(0) } );
89  * database.insertRow ( dbrow );
90  * // insert 4 rows in a single call:
91  * database.insertRows(
92  * std::vector<gum::learning::DBRow<gum::learning::DBTranslatedValue>>
93  * ( 4, dbrow ) );
94  *
95  * // erase some rows
96  * database.eraseRow ( 12 ); // erase the 13th row of the database
97  * database.eraseFirstRow (); // erase the first row of the database
98  * database.eraseLastRow (); // erase the last row of the database
99  * database.eraseFirstRows ( 2 ); // erase the first two rows
100  * database.eraseLastRows ( 3 ); // erase the last three rows
101  * database.eraseRows ( 2,4 ); // erase rows indexed from 2 to 4 (excluded)
102  *
103  * // parse the content of the database, the usual way
104  * for ( const auto& dbrow : database )
105  * std::cout << dbrow.row() << " weight: " << dbrow.weight() << std::endl;
106  *
107  * // ignore some columns of the database, i.e., remove them
108  * database.ignoreColumn ( 3 ); // remove the column X3 of the CSV file
109  * // now, the database contains columns 0, 1, 2, 4, 5, 6, 7 of the
110  * // CSV file. If we wish to remove Column X5 of the CSV file:
111  * database.ignoreColumn ( 5 ); // remove the column X5 of the CSV file
112  * // now, the database contains columns 0, 1, 2, 4, 6, 7 of the CSV file.
113  * // if we wish to remove the 5th column of the DatabaseTable, i.e.,
114  * // column #4 of the DatabaseTable, either we determine that this actually corresponds
115  * // to column X6 of the CSV and we use database.ignoreColumn ( 6 ) or
116  * // we call:
117  * database.ignoreColumn ( 4, false ); // false => 4 = the 5th column of
118  * // the DatabaseTable, not the 5th column/variable of the CSV file
119  * // (remember that all column numbers start from 0).
120  *
121  * // display the columns of the CSV that were ignored and those that
122  * // were kept:
123  * std::vector<std::size_t> ignored_cols = database.ignoredColumns ();
124  * std::vector<std::size_t> kept_cols = database.inputColumns ();
125  *
126  * // parse the content of the database using handlers
127  * typename gum::learning::DatabaseTable<>::HandlerSafe handler( database );
128  * typename gum::learning::DatabaseTable<>::Handler uhandler( database );
129  * // by default, the handlers range over the whole database
130  *
131  * // change the range of rows handled by the DBHandler
132  * std::cout << handler.setRange ( 1, 40 ); // now parses rows [1,40)
133  * std::cout << handler.size (); // displays 39: rows 1,...,39
134  * std::cout << handler.DBSize (); // shows the number of rows in the database
135  * std::cout << handler.numRow (); // displays 0: the handler currently
136  * // points on the first row of its managed area [1,40)
137  *
138  * // move the handler to the next row
139  * handler.nextRow();
140  * std::cout << handler.numRow (); // displays 1: the handler points now
141  * // on the second row of its managed area. This corresponds to the third
142  * // DBRow of the database since the range of handler is [1,40)
143  * ++handler; // move again to the next row
144  * std::cout << handler.numRow (); // displays 2
145  * handler += 4; // advances the pointer by 4 rows
146  * std::cout << handler.numRow (); // displays 6
147  *
148  * // get the DBRow pointed to by the handler: this is the 7th DBRow of the
149  * // handler's managed area, i.e., the 8th DBRow of the database
150  * const auto& xrow7 = handler.row (); // get the DBRow, unsafe version
151  * const auto& yrow7 = handler.rowSafe (); // get the DBRow, safe version
152  * const std::vector<gum::learning::DBCell>& xrow = xrow7.row ();
153  * const double xweight = xrow7.weight ();
154  *
155  * // another way to access the row
156  * const auto& zrow7 = *handler; // get the DBRow, unsafe version
157  *
158  * // check whether there exist other rows managed by the handler after
159  * // the current row
160  * bool has_rows = handler.hasRows (); // true: there remain 33 rows
161  *
162  * // makes the handler point again on the 2nd row of the database
163  * handler.reset (); // the handler points to the beginning of its area
164  * std::cout << handler.numRow (); // displays 0: the handler currently
165  * // points on the first row of its managed area [1,40)
166  *
167  * // see the variables' names, i.e., the names of the database's columns
168  * const auto& vars = handler.variableNames();
169  *
170  * // parse all the rows managed
171  * handler.reset ();
172  * for ( auto end = handler.end (); handler != end; ++handler )
173  * std::cout << handler.row ().weight () << std::endl;
174  *
175  * // another possibility:
176  * for ( const auto& row : handler )
177  * std::cout << row.weight () << std::endl;
178  *
179  *
180  * // clear the content of the database and update the database's
181  * // handlers
182  * database.clear ();
183  * @endcode
184  */
185  template < template < typename > class ALLOC = std::allocator >
187  public:
188  /// the type for the vectors used in the DatabaseTable
189  template < typename TX_DATA >
190  using DBVector = std::vector< TX_DATA, ALLOC< TX_DATA > >;
191 
192  /// a row of the database
193  template < typename TX_DATA >
194  using Row = DBRow< TX_DATA, ALLOC >;
195 
196  /// the type for the matrices stored into the database
197  template < typename TX_DATA >
199 
200  template < template < typename > class XALLOC >
201  using MissingValType = std::vector< std::string, XALLOC< std::string > >;
202 
203  /// the unsafe handler type
205 
206  /// the safe handler type
208 
210 
211  /// Types for STL compliance.
212  /// @{
215  using const_reference = const value_type&;
216  using pointer = value_type*;
217  using const_pointer = const value_type*;
218  using size_type = std::size_t;
219  using difference_type = std::ptrdiff_t;
220  using iterator = Handler;
223  /// @}
224 
225 
226  // ##########################################################################
227  /// @name Constructors / Destructors
228  // ##########################################################################
229  /// @{
230 
231  /// default constructor with user-specified missing value symbols
232  template < template < typename > class XALLOC >
233  DatabaseTable(const MissingValType< XALLOC >& missing_symbols,
234  const DBTranslatorSet< ALLOC >& translators = DBTranslatorSet< ALLOC >(),
235  const allocator_type& alloc = allocator_type());
236 
237  /// default constructor
238  DatabaseTable(const DBTranslatorSet< ALLOC >& translators = DBTranslatorSet< ALLOC >(),
239  const allocator_type& alloc = allocator_type());
240 
241  /// copy constructor
242  DatabaseTable(const DatabaseTable< ALLOC >& from);
243 
244  /// copy constructor with a given allocator
245  DatabaseTable(const DatabaseTable< ALLOC >& from, const allocator_type& alloc);
246 
247  /// move constructor
248  DatabaseTable(DatabaseTable< ALLOC >&& from);
249 
250  /// move constructor with a given allocator
251  DatabaseTable(DatabaseTable< ALLOC >&& from, const allocator_type& alloc);
252 
253  /// virtual copy constructor
254  virtual DatabaseTable< ALLOC >* clone() const final;
255 
256  /// virtual copy constructor with a given allocator
257  virtual DatabaseTable< ALLOC >* clone(const allocator_type& alloc) const final;
258 
259  /// destructor
260  virtual ~DatabaseTable();
261 
262  /// @}
263 
264  // ##########################################################################
265  /// @name Operators
266  // ##########################################################################
267  /// @{
268 
269  /// copy operator
271 
272  /// move operator
274 
275  /// @}
276 
277 
278  // ##########################################################################
279  /// @name Accessors / Modifiers
280  // ##########################################################################
281  /// @{
282 
283  /// insert a new translator into the database table
284  /** @param translator This translator is copied into the DatabaseTable
285  * @param input_column indicates which column in the original dataset
286  * (usually a CSV file) the translator will read
287  * @param unique_column indicates whether the input column must be read by
288  * at most one translator (true) or may be read by several translators (false).
289  * @return the index of the translator within the set of translators
290  * @throws OperationNotAllowed if the input column is marked as ignored
291  * @throws DuplicateElement if there already exists a translator
292  * reading the input column passed in argument, and if the unique_column
293  * is set to true
294  * @warning if the database is not empty, i.e., it contains some records,
295  * the whole column of the database corresponding to the new translator is
296  * filled with missing values.
297  */
298  std::size_t insertTranslator(const DBTranslator< ALLOC >& translator,
299  const std::size_t input_column,
300  const bool unique_column = true);
301 
302  /// insert a new translator into the database table
303  /** @param var the variable that will be contained into the translator
304  * @param input_column indicates which column in the original dataset
305  * (usually a CSV file) the translator will read
306  * @param unique_column indicates whether the input column must be read by
307  * at most one translator (true) or may be read by several translators (false)
310  * @return the index of the translator within the set of translators
311  * @throws OperationNotAllowed if the input column is marked as ignored
312  * @throws DuplicateElement if there already exists a translator
313  * reading the input column passed in argument, and if the unique_column
314  * is set to true
315  * @throws MissingValueInDatabase if the database is not empty, i.e., it
316  * contains some records. In this case, the column of the database
317  * corresponding to the new translator should be filled with missing
318  * values, which is impossible since we do not know which symbols
319  * correspond to missing values. If you do not want such a behavior,
320  * use method insertTranslator in which you specify the set of missing
321  * symbols.
322  */
323  std::size_t insertTranslator(const Variable& var,
324  const std::size_t input_column,
325  const bool unique_column = true);
326 
327  /// insert a new translator into the database table
328  /** @param var the variable that will be contained into the translator
329  * @param input_column indicates which column in the original dataset
330  * (usually a CSV file) the translator will read
331  * @param missing_symbols the set of symbols in the database
332  * representing missing values
333  * @param unique_column indicates whether the input column must be read by
334  * at most one translator (true) or may be read by several translators (false)
335  * @return the index of the translator within the set of translators
336  * @throws OperationNotAllowed if the input column is marked as ignored
337  * @throws DuplicateElement if there already exists a translator
338  * reading the input column passed in argument, and if the unique_column
339  * is set to true
340  * @warning if the database is not empty, i.e., it contains some records,
341  * the whole column of the database corresponding to the new translator is
342  * filled with missing values.
343  */
344  template < template < typename > class XALLOC >
345  std::size_t
346  insertTranslator(const Variable& var,
347  const std::size_t input_column,
348  std::vector< std::string, XALLOC< std::string > > missing_symbols,
349  const bool unique_column = true);
350 
351  /** @brief erases either the kth translator or all those parsing the kth
352  * column of the input dataset
353  *
354  * Translators read an input dataset that is not necessarily the same as
355  * the content of the DatabaseTable. For instance, a CSV may contain 10
356  * columns, but if a DatabaseTable only contains two translators reading
357  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
358  * columns. When k_is_input_col is set to false, Parameter k passed in
359  * argument corresponds to either 0 or 1, i.e., to the index of one of
360  * these two output columns. When k_is_input_col is set to true, the
361  * translators to be erased are all those that parse the kth column of the
362  * input database.
363  * @warning if the translator does not exist, nothing is done. In
364  * particular, no exception is raised. */
365  void eraseTranslators(const std::size_t k, const bool k_is_input_col = false);
366 
367  /// returns the set of translators
368  const DBTranslatorSet< ALLOC >& translatorSet() const;
369 
370  /** @brief returns either the kth translator of the database table or the
371  * first one reading the kth column of the input database
372  *
373  * Translators read an input dataset that is not necessarily the same as
374  * the content of the DatabaseTable. For instance, a CSV may contain 10
375  * columns, but if a DatabaseTable only contains two translators reading
376  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
377  * columns. When k_is_input_col is set to false, Parameter k passed in
378  * argument corresponds to either 0 or 1, i.e., the index of one of these
379  * two columns. When k_is_input_col is set to true, the translator returned
380  * is the first one that parses the kth column of the input database.
381  * @throw UndefinedElement is raised if there is no translator
382  * corresponding to k. */
383  const DBTranslator< ALLOC >& translator(const std::size_t k,
384  const bool k_is_input_col = false) const;
385 
386  /** @brief returns either the kth variable of the database table or the
387  * first one corresponding to the kth column of the input database
388  *
389  * Translators read an input dataset that is not necessarily the same as
390  * the content of the DatabaseTable. For instance, a CSV may contain 10
391  * columns, but if a DatabaseTable only contains two translators reading
392  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
393  * columns. When k_is_input_col is set to false, Parameter k passed in
394  * argument corresponds to either 0 or 1, i.e., the index of one of these
395  * two columns. When k_is_input_col is set to true, the variable is that
396  * of the translator that parses the kth column of the input database.
397  * @throw UndefinedElement is raised if there is no translator
398  * corresponding to k. */
399  const Variable& variable(const std::size_t k, const bool k_is_input_col = false) const;
400 
401  /// sets the names of the variables
403 
404  /// sets the names of the variables
405  /** This method can be called in two different ways: either the names
406  * correspond precisely to the columns stored into the database table
407  * (in this case, parameter from_external_object is equal to false),
408  * or they correspond to the columns of an external database (e.g., a
409  * CSV file) from which we potentially excluded some columns and,
410  * consequently, the latter should not be taken into account (in this
411  * case, parameter from_external_object is equal to true). As an
412  * example, imagine that the database table is created from a CSV file
413  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Suppose that
414  * we asked the database table to ignore columns X1 and X3. Then
415  * setVariableNames( { "X0", "X1", "X2", "X3", "X4" }, true ) will
416  * set the columns of the database table as { "X0", "X2", "X4" }. The
417  * same result could be obtained by executing
418  * setVariableNames( { "X0", "X2", "X4" }, false ), which specifies
419  * directly the set of names to retain in the database table.
420  * @param names the names of all the columns, including the ignored
421  * columns if from_external_object is set to true, else excluding
422  * them (i.e., this should precisely correspond to the columns stored
423  * into the database table).
424  * @param from_external_object a Boolean indicating whether parameter
425  * names includes the columns ignored by the database table (true) or
426  * not (false).
427  * @throw SizeError is raised if the names passed in arguments cannot be
428  * assigned to the columns of the DatabaseTable because the size of their
429  * vector is inadequate. */
430  virtual void setVariableNames(const std::vector< std::string, ALLOC< std::string > >& names,
431  const bool from_external_object = true) final;
432 
433  /** @brief makes the database table ignore from now on the kth column of
434  * the input dataset or the column parsed by the kth translator
435  *
436  * This method can be called in two different ways: either k refers to
437  * the current kth column of the database table (in this case parameter
438  * from_external_object is set to false), or k corresponds to the kth
439  * column of an original database used to fill the database table
440  * (in this case from_external_object is set to true). Depending on
441  * from_external_object's value, the ignored columns may differ. As an
442  * example, imagine that the database table is created from a CSV file
443  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Then a call to
444  * ignoreColumn ( 1, true ) will exclude column X1 from the database table.
445  * As a result, the database table columns are X0, X2, X3 and X4.
446  * Therefore, subsequently calling ignoreColumn ( 1, false ) will result
447  * in excluding X2 since X2 is the 2nd column (columns are indexed
448  * starting from 0). So, now the database table's columns are
449  * X0, X3 and X4. If, now, we call ignoreColumn ( 3, true ), this will
450  * remove column X3 because, in the original database, X3 was the 4th
451  * column.
452  *
453  * The method also erases all the translators corresponding to column k,
454  * if any. If the DatabaseTable contains some rows, then their column
455  * corresponding to k is removed. If the resulting DatabaseTable
456  * contains only empty rows, then those are removed.
457  *
458  * @param k the column to remove. See Method setVariableNames for a
459  * detailed description on how k is computed.
460  * @param from_external_object indicates whether k refers to the kth
461  * column of an original external database (true) or to the current kth
462  * column of the DatabaseTable (false).
463  * @throw UndefinedElement is raised if k refers to the position of a
464  * translator that does not exist (k >= number of translators). */
465  virtual void ignoreColumn(const std::size_t k, const bool from_external_object = true) final;
466 
467  /// returns the set of columns of the original dataset that are ignored
468  /** In this vector, all the column indices greater than or equal to its
469  * last element are also ignored. */
470  virtual const DBVector< std::size_t > ignoredColumns() const final;
471 
472  /** @brief returns the set of columns of the original dataset that are
473  * present in the DatabaseTable */
474  virtual const DBVector< std::size_t > inputColumns() const final;
475 
476  /** @brief returns the domain size of the kth variable of the database
477  * table or of that of the first one corresponding to the kth column of
478  * the input database
479  *
480  * Translators read an input dataset that is not necessarily the same as
481  * the content of the DatabaseTable. For instance, a CSV may contain 10
482  * columns, but if a DatabaseTable only contains two translators reading
483  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
484  * columns. When k_is_input_col is set to false, Parameter k passed in
485  * argument corresponds to either 0 or 1, i.e., the index of one of these
486  * two columns. When k_is_input_col is set to true, the variable is that
487  * of the translator that parses the kth column of the input database.
488  * @throw UndefinedElement is raised if there is no translator
489  * corresponding to k. */
490  std::size_t domainSize(const std::size_t k, const bool k_is_input_col = false) const;
491 
492  /// returns the domain sizes of all the variables in the database table
493  DBVector< std::size_t > domainSizes() const;
494 
495  /** @brief indicates whether a reordering is needed to sort the translations
496  * of the kth translator or those of the first translator parsing the kth
497  * column
498  *
499  * For a given translator, if the strings represented by the translations
500  * are only numbers, the translations are considered to be sorted if and
501  * only if they are sorted by increasing number. If the strings do not
502  * only represent numbers, then translations are considered to be sorted
503  * if and only if they are sorted lexicographically.
504  *
505  * When constructing dynamically its dictionary, the translator may
506  * assign wrong DBTranslatedValue values to strings. For instance, a
507  * translator reading sequentially integer strings 4, 1, 3, may map
508  * 4 into DBTranslatedValue{std::size_t(0)},
509  * 1 into DBTranslatedValue{std::size_t(1)} and
510  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
511  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
512  * a domain specified with increasing values. This requires a
513  * reordering. Method needsReordering() returns a Boolean indicating
514  * whether such a reordering should be performed or whether the current
515  * order is OK.
516  *
517  * Translators read an input dataset that is not necessarily the same as
518  * the content of the DatabaseTable. For instance, a CSV may contain 10
519  * columns, but if a DatabaseTable only contains two translators reading
520  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
521  * columns. When k_is_input_col is set to false, Parameter k passed in
522  * argument corresponds to either 0 or 1, i.e., the index of one of these
523  * two columns. When k_is_input_col is set to true, the translator to be
524  * reordered is that which parses the kth column of the input database.
525  * @throw UndefinedElement is raised if there is no translator
526  * corresponding to k. */
527  bool needsReordering(const std::size_t k, const bool k_is_input_col = false) const;
528 
529  /** @brief performs a reordering of the kth translator or
530  * of the first translator parsing the kth column of the input database
531  *
532  * For a given translator, if the strings represented by the translations
533  * are only numbers, the translations are considered to be sorted if and
534  * only if they are sorted by increasing number. If the strings do not
535  * only represent numbers, then translations are considered to be sorted
536  * if and only if they are sorted lexicographically.
537  *
538  * Translators read an input dataset that is not necessarily the same as
539  * the content of the DatabaseTable. For instance, a CSV may contain 10
540  * columns, but if a DatabaseTable only contains two translators reading
541  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
542  * columns. When k_is_input_col is set to false, Parameter k passed in
543  * argument corresponds to either 0 or 1, i.e., the index of one of these
544  * two columns. When k_is_input_col is set to true, the translator to be
545  * reordered is that which parses the kth column of the input database.
546  * @throw UndefinedElement is raised if there is no translator
547  * corresponding to k. */
548  void reorder(const std::size_t k, const bool k_is_input_col = false);
549 
550  /// performs a reordering of all the columns
551  /** For a given translator, if the strings represented by the translations
552  * are only numbers, the translations are considered to be sorted if and
553  * only if they are sorted by increasing number. If the strings do not
554  * only represent numbers, then translations are considered to be sorted
555  * if and only if they are sorted lexicographically. */
556  void reorder();
557 
558  /// insert a new row at the end of the database
560 
561  /// insert a new row at the end of the database
562  /** The new_row passed in argument is supposed to come from an external
563  * database. So it must contain data for the ignored columns.
564  * @throws SizeError is raised if the vector of strings cannot be inserted
565  * in the DatabaseTable because its size does not allow a matching with
566  * the columns of the DatabaseTable (taking into account the ignored
567  * columns)
568  * @throws UnknownLabelInDatabase is raised if the translation of an
569  * element in the new row cannot be found and the corresponding translator
570  * is not in an editable dictionary mode.
571  * @throws SizeError is raised if the number of entries in the dictionary
572  * of a translator has already reached its maximum.
573  * @throws OperationNotAllowed exception is raised if the translation of
574  * an element in new_row cannot be found and the insertion of the string
575  * into the corresponding translator's dictionary fails because it would
576  * induce incoherent behavior (e.g., a DBTranslator4ContinuousVariable
577  * that contains a variable whose domain is [x,y] as well as a missing
578  * value symbol z \f$\in\f$ [x,y]).
579  * @throws TypeError is raised if the translation of an element in new_row
580  * cannot be found and the insertion of the string into the translator's
581  * dictionary fails due to str being impossible to be converted into an
582  * appropriate type.
583  */
584  virtual void insertRow(const std::vector< std::string, ALLOC< std::string > >& new_row) final;
585 
586  /// insert a new DBRow at the end of the database
587  /** Unlike methods insertRow for data whose type is different from
588  * DBTranslatedValue, this method assumes that the new row passed in
589  * argument does not contain any data of the ignored columns. So,
590  * basically, it could be copied as is into the database table.
591  * @throw SizeError is raised if the size of the new_row is not equal to
592  * the number of translators of the DatabaseTable
593  * @throw InvalidArgument is raised if at least one element of new_row does
594  * not belong to the domain of its corresponding translator.
595  */
596  virtual void insertRow(Row< DBTranslatedValue >&& new_row,
597  const IsMissing contains_missing_data) final;
598 
599  /// insert a new row at the end of the database
600  /** Unlike methods insertRow for data whose type is different from
601  * DBTranslatedValue, this method assumes that the new row passed in
602  * argument does not contain any data of the ignored columns. So,
603  * basically, it could be copied as is into the database table.
604  * @throw SizeError is raised if the size of the new_row is not equal to
605  * the number of translators of the DatabaseTable
606  * @throw InvalidArgument is raised if at least one element of new_row does
607  * not belong to the domain of its corresponding translator.
608  */
609  virtual void insertRow(const Row< DBTranslatedValue >& new_row,
610  const IsMissing contains_missing_data) final;
611 
612  /// insert a new DBRow of DBCells at the end of the database
613  /** The new_row passed in argument is supposed to come from an external
614  * database. So it must contain data for the ignored columns.
615  * @throw SizeError is raised if the new row cannot be inserted
616  * in the DatabaseTable because its size does not allow a matching with
617  * the columns of the DatabaseTable (taking into account the ignored
618  * columns) */
619  virtual void insertRow(const Row< DBCell >& new_row) final;
620 
621  /// insert a new DBRow of DBCells at the end of the database
622  /** The new_row passed in argument is supposed to come from an external
623  * database. So it must contain data for the ignored columns.
624  * @throw SizeError is raised if the new row cannot be inserted
625  * in the DatabaseTable because its size does not allow a matching with
626  * the columns of the DatabaseTable (taking into account the ignored
627  * columns) */
628  virtual void insertRow(Row< DBCell >&& new_row) final;
629 
630  /// insert a set of new DBRows at the end of the database
631  /** Unlike methods insertRows for data whose type is different from
632  * DBTranslatedValue, this method assumes that the new rows passed in
633  * argument do not contain any data of the ignored columns. So, basically,
634  * these rows could be copied as is into the database table.
635  * @throw SizeError is raised if the size of at least one row in new_rows
636  * is not equal to the number of translators in the DatabaseTable
637  * @throw InvalidArgument is raised if at least one element of new_rows does
638  * not belong to the domain of its corresponding translator.
639  */
640  virtual void insertRows(Matrix< DBTranslatedValue >&& new_rows,
641  const DBVector< IsMissing >& rows_have_missing_vals) final;
642 
643  /// insert a set of new DBRows at the end of the database
644  /** Unlike methods insertRows for data whose type is different from
645  * DBTranslatedValue, this method assumes that the new rows passed in
646  * argument do not contain any data of the ignored columns. So, basically,
647  * these rows could be copied as is into the database table.
648  * @throw SizeError is raised if the size of at least one row in new_rows
649  * is not equal to the number of translators in the DatabaseTable
650  * @throw InvalidArgument is raised if at least one element of new_rows does
651  * not belong to the domain of its corresponding translator.*/
652  virtual void insertRows(const Matrix< DBTranslatedValue >& new_rows,
653  const DBVector< IsMissing >& rows_have_missing_vals) final;
654 
655  /// insert a set of new DBRows at the end of the database
656  /** The new rows passed in argument are supposed to come from an external
657  * database. So they must contain data for the ignored columns.
658  * @throw SizeError is raised if at least one row cannot be inserted
659  * in the DatabaseTable because its size does not allow a matching with
660  * the columns of the DatabaseTable (taking into account the ignored
661  * columns) */
662  virtual void insertRows(Matrix< DBCell >&& new_rows) final;
663 
664  /// insert a set of new DBRows at the end of the database
665  /** The new rows passed in argument are supposed to come from an external
666  * database. So they must contain data for the ignored columns.
667  * @throw SizeError is raised if at least one row cannot be inserted
668  * in the DatabaseTable because its size does not allow a matching with
669  * the columns of the DatabaseTable (taking into account the ignored
670  * columns) */
671  virtual void insertRows(const Matrix< DBCell >& new_rows) final;
672 
673  /// erases the content of the database, including the names of the variables
674  virtual void clear() final;
675 
676  // substitutes the kth translator by another one
677  /* The method checks that:
678  * 1/ it is possible to get back the original values of the database
679  * for the rows already translated.
680  * 2/ the new translator is capable of translating these values.
681  *
682  * If both checks pass, then it replaces the kth translator
683  * by the one passed in arguments and retranslates with it the kth
684  * cell of all the rows already contained in the database */
685 
686  /// @}
687 
688 
689 #ifndef DOXYGEN_SHOULD_SKIP_THIS
690 
691  private:
692  /// the set of translators used to convert the strings into floats
693  DBTranslatorSet< ALLOC > _translators_;
694 
695  /// the set of columns that the user asked to ignore
696  Set< std::size_t, ALLOC< std::size_t > > _ignored_cols_;
697 
698  /** @brief checks whether a row's values are compatible with those of the
699  * translators' variables */
700  bool _isRowCompatible_(const Row< DBTranslatedValue >& row) const;
701 
702  /** @brief returns the index corresponding either to the kth translator or
703  * to the first one that parses the kth column of the input dataset
704  *
705  * @warning if the translator does not exist, the function returns an
706  * index which is greater than the number of translators */
707  std::size_t _getKthIndex_(const std::size_t k, const bool k_is_input_col) const;
708 
709  /** @brief returns the indices corresponding either to the kth translator
710  * or to all those that parse the kth column of the input dataset
711  *
712  * @warning the indices are sorted in decreasing order */
713  DBVector< std::size_t > _getKthIndices_(const std::size_t k, const bool k_is_input_col) const;
714 
715  /// a method to process the rows of the database in multithreading
716  /** The function tries to execute function/functor exec_func using one
717  * or several threads. If an exception is raised by at least one thread,
718  * then function undo_func is executed to undo what exec_func
719  * did, and the exception is rethrown.
720  *
721  * @param exec_func this should be a function/functor/lambda that
722  * takes 2 arguments: the first one is an std::size_t containing the
723  * index of the first row that it should process, the second argument is
724  * an std::size_t equal to 1 + the index of the last row processed (so
725  * the processing is performed on [first,last)). The return type of exec_func
726  * is void. If a thread executing exec_func raises an exception, then
727  * before exiting, it should undo what it did.
728  * @param undo_func a function/functor/lambda with the same
729  * prototype as exec_func. If a thread raises an exception, those that
730  * did not raise exceptions should undo what they did in order to restore
731  * the state that the database had before the execution of the thread. After
732  * calling undo_func, they should have restored this state.
733  */
734  template < typename Functor1, typename Functor2 >
735  void _threadProcessDatabase_(Functor1& exec_func, Functor2& undo_func);
736 
737 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
738  };
739 
740  } /* namespace learning */
741 
742 } /* namespace gum */
743 
744 /// always include the templated implementations
745 #include <agrum/tools/database/databaseTable_tpl.h>
746 
747 #endif /* GUM_DATABASE_TABLE_H */
virtual ~DatabaseTable()
destructor
virtual void insertRow(const Row< DBTranslatedValue > &new_row, const IsMissing contains_missing_data) final
insert a new row at the end of the database
virtual void insertRows(Matrix< DBCell > &&new_rows) final
insert a set of new DBRows at the end of the database
DatabaseTable(const DatabaseTable< ALLOC > &from)
copy constructor
virtual void ignoreColumn(const std::size_t k, const bool from_external_object=true) final
makes the database table ignore from now on the kth column of the input dataset or the column parsed ...
DBVector< std::size_t > domainSizes() const
returns the domain sizes of all the variables in the database table
const DBTranslatorSet< ALLOC > & translatorSet() const
returns the set of translators
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
virtual void insertRows(const Matrix< DBTranslatedValue > &new_rows, const DBVector< IsMissing > &rows_have_missing_vals) final
insert a set of new DBRows at the end of the database
virtual void insertRow(const std::vector< std::string, ALLOC< std::string > > &new_row) final
insert a new row at the end of the database
DatabaseTable(DatabaseTable< ALLOC > &&from)
move constructor
DatabaseTable(const DatabaseTable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
bool needsReordering(const std::size_t k, const bool k_is_input_col=false) const
indicates whether a reordering is needed to sort the translations of the kth translator or those of t...
std::size_t insertTranslator(const Variable &var, const std::size_t input_column, const bool unique_column=true)
insert a new translator into the database table
virtual DatabaseTable< ALLOC > * clone() const final
virtual copy constructor
virtual void insertRow(const Row< DBCell > &new_row) final
insert a new DBRow of DBCells at the end of the database
virtual const DBVector< std::size_t > inputColumns() const final
returns the set of columns of the original dataset that are present in the DatabaseTable ...
const DBTranslator< ALLOC > & translator(const std::size_t k, const bool k_is_input_col=false) const
returns either the kth translator of the database table or the first one reading the kth column of th...
virtual void insertRow(Row< DBTranslatedValue > &&new_row, const IsMissing contains_missing_data) final
insert a new DBRow at the end of the database
std::size_t domainSize(const std::size_t k, const bool k_is_input_col=false) const
returns the domain size of the kth variable of the database table or of that of the first one corresp...
virtual const DBVector< std::size_t > ignoredColumns() const final
returns the set of columns of the original dataset that are ignored
DatabaseTable< ALLOC > & operator=(const DatabaseTable< ALLOC > &from)
copy operator
DatabaseTable(DatabaseTable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual void insertRow(Row< DBCell > &&new_row) final
insert a new DBRow of DBCells at the end of the database
const Variable & variable(const std::size_t k, const bool k_is_input_col=false) const
returns either the kth variable of the database table or the first one corresponding to the kth colum...
virtual void insertRows(Matrix< DBTranslatedValue > &&new_rows, const DBVector< IsMissing > &rows_have_missing_vals) final
insert a set of new DBRows at the end of the database
std::size_t insertTranslator(const Variable &var, const std::size_t input_column, std::vector< std::string, XALLOC< std::string > > missing_symbols, const bool unique_column=true)
insert a new translator into the database table
The class representing a tabular database as used by learning tasks.
virtual void clear() final
erase the content of the database, including the names of the variables
virtual void setVariableNames(const std::vector< std::string, ALLOC< std::string > > &names, const bool from_external_object=true) final
sets the names of the variables
DatabaseTable< ALLOC > & operator=(DatabaseTable< ALLOC > &&from)
move operator
virtual void insertRows(const Matrix< DBCell > &new_rows) final
insert a set of new DBRows at the end of the database
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
DatabaseTable(const MissingValType< XALLOC > &missing_symbols, const DBTranslatorSet< ALLOC > &translators=DBTranslatorSet< ALLOC >(), const allocator_type &alloc=allocator_type())
default constructor
void reorder(const std::size_t k, const bool k_is_input_col=false)
performs a reordering of the kth translator or of the first translator parsing the kth column of the ...
void eraseTranslators(const std::size_t k, const bool k_is_input_col=false)
erases either the kth translator or all those parsing the kth column of the input dataset ...
virtual DatabaseTable< ALLOC > * clone(const allocator_type &alloc) const final
virtual copy constructor with a given allocator
void reorder()
performs a reordering of all the columns