aGrUM  0.21.0
a C++ library for (probabilistic) graphical models
databaseTable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class representing a tabular database stored in RAM
24  *
25  * Class DatabaseTable represents a tabular database that stores in the
26  * computer's random access memory (RAM) its content as a vector of DBRows
27  * of DBTranslatedValue instances.
28  *
29  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
30  */
31 #ifndef GUM_DATABASE_TABLE_H
32 #define GUM_DATABASE_TABLE_H
33 
34 #include <numeric>
35 #include <algorithm>
36 #include <functional>
37 #include <exception>
38 #include <vector>
39 
40 #include <agrum/agrum.h>
41 #include <agrum/tools/core/math/math_utils.h>
42 #include <agrum/tools/core/set.h>
43 #include <agrum/tools/core/thread.h>
44 #include <agrum/tools/database/DBCell.h>
45 #include <agrum/tools/database/DBRow.h>
46 #include <agrum/tools/database/DBTranslatedValue.h>
47 #include <agrum/tools/database/IDatabaseTable.h>
48 #include <agrum/tools/database/DBTranslatorSet.h>
49 #include <agrum/tools/database/DBTranslator4ContinuousVariable.h>
50 
51 namespace gum {
52 
53  namespace learning {
54 
55  /** @class DatabaseTable
56  * @brief The class representing a tabular database as used by learning tasks.
57  * @headerfile databaseTable.h <agrum/tools/database/databaseTable.h>
58  * @ingroup learning_database
59  *
60  * Class DatabaseTable represents a tabular database that stores in the
61  * computer's random access memory (RAM) its content as a vector of DBRows
62  * of DBTranslatedValue instances. This class is very well suited for fast
63  * learning algorithms.
64  *
65  * @par Usage example:
66  * @code
67  * // create the database from a CSV. This is not compulsory for
68  * // DatabaseTable instances, but this is how we usually create
69  * // DatabaseTable instances
70  * gum::learning::DBInitializerFromCSV<> initializer ( "asia.csv" );
71  * const auto& var_names = initializer.variableNames ();
72  * gum::learning::DBTranslatorSet<> translator_set;
73  * gum::learning::DBTranslator4LabelizedVariable<> translator;
74  * for ( std::size_t i = 0; i < var_names.size(); ++i )
75  * translator_set.insertTranslator ( translator, i );
76  * gum::learning::DatabaseTable<> database ( translator_set );
77  * database.setVariableNames( initializer.variableNames () );
78  *
79  * // here, database contains the content of the asia.csv file.
80  * // determine how many columns and rows the database contains
81  * std::size_t nb_rows = database.nbRows();
82  * std::size_t nb_cols = database.nbVariables ();
83  *
84  * // manually add a new row into the database
85  * std::vector<std::string> row( 8, "toto" ); // asia has 8 columns
86  * database.insertRow ( row );
87  * gum::learning::DBRow<gum::learning::DBTranslatedValue>
88  * dbrow ( 8, gum::learning::DBTranslatedValue { std::size_t(0) } );
89  * database.insertRow ( dbrow );
90  * // insert 4 rows in a single call:
91  * database.insertRows(
92  * std::vector<gum::learning::DBRow<gum::learning::DBTranslatedValue>>
93  * ( 4, dbrow ) );
94  *
95  * // erase some rows
96  * database.eraseRow ( 12 ); // erase the 13th row of the database
97  * database.eraseFirstRow (); // erase the first row of the database
98  * database.eraseLastRow (); // erase the last row of the database
99  * database.eraseFirstRows ( 2 ); // erase the first two rows
100  * database.eraseLastRows ( 3 ); // erase the last three rows
101  * database.eraseRows ( 2,4 ); // erase rows indexed from 2 to 4 (excluded)
102  *
103  * // parse the content of the database, the usual way
104  * for ( const auto& dbrow : database )
105  * std::cout << dbrow.row() << " weight: " << dbrow.weight() << std::endl;
106  *
107  * // ignore some columns of the database, i.e., remove them
108  * database.ignoreColumn ( 3 ); // remove the column X3 of the CSV file
109  * // now, the database contains columns 0, 1, 2, 4, 5, 6, 7 of the
110  * // CSV file. If we wish to remove Column X5 of the CSV file:
111  * database.ignoreColumn ( 5 ); // remove the column X5 of the CSV file
112  * // now, the database contains columns 0, 1, 2, 4, 6, 7 of the CSV file.
113  * // if we wish to remove the 5th column of the DatabaseTable, i.e.,
114  * // its column #4, either we determine that this actually corresponds
115  * // to column X6 of the CSV and we use database.ignoreColumn ( 6 ) or
116  * // we call:
117  * database.ignoreColumn ( 4, false ); // false => 4 = the 5th column of
118  * // the DatabaseTable, not the 5th column/variable of the CSV file
119  * // (remember that all column numbers start from 0).
120  *
121  * // display the columns of the CSV that were ignored and those that
122  * // were kept:
123  * std::vector<std::size_t> ignored_cols = database.ignoredColumns ();
124  * std::vector<std::size_t> kept_cols = database.inputColumns ();
125  *
126  * // parse the content of the database using handlers
127  * typename gum::learning::DatabaseTable<>::HandlerSafe handler( database );
128  * typename gum::learning::DatabaseTable<>::Handler uhandler( database );
129  * // by default, the handlers range over the whole database
130  *
131  * // change the range of rows handled by the DBHandler
132  * handler.setRange ( 1, 40 ); // now parses rows [1,40)
133  * std::cout << handler.size (); // displays 39: rows 1,...,39
134  * std::cout << handler.DBSize (); // shows the number of rows in the database
135  * std::cout << handler.numRow (); // displays 0: the handler currently
136  * // points on the first row of its managed area [1,40)
137  *
138  * // move the handler to the next row
139  * handler.nextRow();
140  * std::cout << handler.numRow (); // displays 1: the handler points now
141  * // on the second row of its managed area. This corresponds to the third
142  * // DBRow of the database since the range of handler is [1,40)
143  * ++handler; // move again to the next row
144  * std::cout << handler.numRow (); // displays 2
145  * handler += 4; // advances the pointer by 4 rows
146  * std::cout << handler.numRow (); // displays 6
147  *
148  * // get the DBRow pointed to by the handler: this is the 7th DBRow of
149  * // the handler's area, i.e., the 8th DBRow of the database
150  * const auto& xrow7 = handler.row (); // get the DBRow, unsafe version
151  * const auto& yrow7 = handler.rowSafe (); // get the DBRow, safe version
152  * const std::vector<gum::learning::DBTranslatedValue>& xrow = xrow7.row ();
153  * const double xweight = xrow7.weight ();
154  *
155  * // another way to access the row
156  * const auto& zrow7 = *handler; // get the DBRow, unsafe version
157  *
158  * // check whether there exist other rows managed by the handler after
159  * // the current row
160  * bool has_rows = handler.hasRows (); // true: there remain 33 rows
161  *
162  * // makes the handler point again on the 2nd row of the database
163  * handler.reset (); // the handler points to the beginning of its area
164  * std::cout << handler.numRow (); // displays 0: the handler currently
165  * // points on the first row of its managed area [1,40)
166  *
167  * // see the variables' names, i.e., the names of the database's columns
168  * const auto& vars = handler.variableNames();
169  *
170  * // parse all the rows managed
171  * handler.reset ();
172  * for ( auto end = handler.end (); handler != end; ++handler )
173  * std::cout << handler.row ().weight () << std::endl;
174  *
175  * // another possibility:
176  * for ( const auto& row : handler )
177  * std::cout << row.weight () << std::endl;
178  *
179  *
180  * // clear the content of the database and update the database's
181  * // handlers
182  * database.clear ();
183  * @endcode
184  */
185  template < template < typename > class ALLOC = std::allocator >
187  public:
188  /// the type for the vectors used in the DatabaseTable
189  template < typename TX_DATA >
190  using DBVector = std::vector< TX_DATA, ALLOC< TX_DATA > >;
191 
192  /// a row of the database
193  template < typename TX_DATA >
194  using Row = DBRow< TX_DATA, ALLOC >;
195 
196  /// the type for the matrices stored into the database
197  template < typename TX_DATA >
199 
200  template < template < typename > class XALLOC >
201  using MissingValType = std::vector< std::string, XALLOC< std::string > >;
202 
203  /// the unsafe handler type
205 
206  /// the safe handler type
208 
210 
211  /// Types for STL compliance.
212  /// @{
215  using const_reference = const value_type&;
216  using pointer = value_type*;
217  using const_pointer = const value_type*;
218  using size_type = std::size_t;
219  using difference_type = std::ptrdiff_t;
220  using iterator = Handler;
223  /// @}
224 
225 
226  // ##########################################################################
227  /// @name Constructors / Destructors
228  // ##########################################################################
229  /// @{
230 
231  /// constructor with an explicit set of missing symbols
232  template < template < typename > class XALLOC >
233  DatabaseTable(const MissingValType< XALLOC >& missing_symbols,
234  const DBTranslatorSet< ALLOC >& translators = DBTranslatorSet< ALLOC >(),
235  const allocator_type& alloc = allocator_type());
236 
237  /// default constructor
238  DatabaseTable(const DBTranslatorSet< ALLOC >& translators = DBTranslatorSet< ALLOC >(),
239  const allocator_type& alloc = allocator_type());
240 
241  /// copy constructor
242  DatabaseTable(const DatabaseTable< ALLOC >& from);
243 
244  /// copy constructor with a given allocator
245  DatabaseTable(const DatabaseTable< ALLOC >& from, const allocator_type& alloc);
246 
247  /// move constructor
248  DatabaseTable(DatabaseTable< ALLOC >&& from);
249 
250  /// move constructor with a given allocator
251  DatabaseTable(DatabaseTable< ALLOC >&& from, const allocator_type& alloc);
252 
253  /// virtual copy constructor
254  virtual DatabaseTable< ALLOC >* clone() const final;
255 
256  /// virtual copy constructor with a given allocator
257  virtual DatabaseTable< ALLOC >* clone(const allocator_type& alloc) const final;
258 
259  /// destructor
260  virtual ~DatabaseTable();
261 
262  /// @}
263 
264  // ##########################################################################
265  /// @name Operators
266  // ##########################################################################
267  /// @{
268 
269  /// copy operator
271 
272  /// move operator
274 
275  /// @}
276 
277 
278  // ##########################################################################
279  /// @name Accessors / Modifiers
280  // ##########################################################################
281  /// @{
282 
283  /// insert a new translator into the database table
284  /** @param translator This translator is copied into the DatabaseTable
285  * @param input_column indicates which column in the original dataset
286  * (usually a CSV file) the translator will read
287  * @param unique_column indicates whether the input column can be read by
288  * several translators.
289  * @return the index of the translator within the set of translators
290  * @throws OperationNotAllowed if the input column is marked as ignored
291  * @throws DuplicateElement if there already exists a translator
292  * reading the input column passed in argument, and if the unique_column
293  * is set to true
294  * @warning if the database is not empty, i.e., it contains some records,
295  * the whole column of the database corresponding to the new translator is
296  * filled with missing values.
297  */
298  std::size_t insertTranslator(const DBTranslator< ALLOC >& translator,
299  const std::size_t input_column,
300  const bool unique_column = true);
301 
302  /// insert a new translator into the database table
303  /** @param var the variable that will be contained into the translator
304  * @param input_column indicates which column in the original dataset
305  * (usually a CSV file) the translator will read
306  * @param unique_column indicates whether the input column can be read by
307  * several translators
310  * @return the index of the translator within the set of translators
311  * @throws OperationNotAllowed if the input column is marked as ignored
312  * @throws DuplicateElement if there already exists a translator
313  * reading the input column passed in argument, and if the unique_column
314  * is set to true
315  * @throws MissingValueInDatabase if the database is not empty, i.e., it
316  * contains some records: the column of the database corresponding to the
317  * new translator should then be filled with missing values, which is
318  * impossible since this overload does not know which symbols correspond
319  * to missing values. If you do not want such a behavior, use the
320  * insertTranslator overload in which you specify the set of missing
321  * symbols.
322  */
323  std::size_t insertTranslator(const Variable& var,
324  const std::size_t input_column,
325  const bool unique_column = true);
326 
327  /// insert a new translator into the database table
328  /** @param var the variable that will be contained into the translator
329  * @param input_column indicates which column in the original dataset
330  * (usually a CSV file) the translator will read
331  * @param missing_symbols the set of symbols in the database
332  * representing missing values
333  * @param unique_column indicates whether the input column can be read by
334  * several translators
335  * @return the index of the translator within the set of translators
336  * @throws OperationNotAllowed if the input column is marked as ignored
337  * @throws DuplicateElement if there already exists a translator
338  * reading the input column passed in argument, and if the unique_column
339  * is set to true
340  * @warning if the database is not empty, i.e., it contains some records,
341  * the whole column of the database corresponding to the new translator is
342  * filled with missing values.
343  */
344  template < template < typename > class XALLOC >
345  std::size_t
346  insertTranslator(const Variable& var,
347  const std::size_t input_column,
348  std::vector< std::string, XALLOC< std::string > > missing_symbols,
349  const bool unique_column = true);
350 
351  /** @brief erases either the kth translator or all those parsing the kth
352  * column of the input dataset
353  *
354  * Translators read an input dataset that is not necessarily the same as
355  * the content of the DatabaseTable. For instance, a CSV may contain 10
356  * columns, but if a DatabaseTable only contains two translators reading
357  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
358  * columns. When k_is_input_col is set to false, Parameter k passed in
359  * argument corresponds to either 0 or 1, i.e., to the index of one of
360  * these two output columns. When k_is_input_col is set to true, the
361  * translators to be erased are all those that parse the kth column of the
362  * input database.
363  * @warning if the translator does not exist, nothing is done. In
364  * particular, no exception is raised. */
365  void eraseTranslators(const std::size_t k, const bool k_is_input_col = false);
366 
367  /// change the translator of a database column
368  /**
369  * When changing the translator for a column, we update the content of the database.
370  * @param new_translator the new translator to use
371  * @param k either the column in the DatabaseTable (if k_is_input_col = false, the
372  * default) or the first column in the DatabaseTable which corresponds to the kth
373  * column of the input CSV (if k_is_input_col = true)
374  * @param k_is_input_col see Parameter k
375  * @warning if the translator does not exist, nothing is done. In
376  * particular, no exception is raised.
377  */
378  void changeTranslator(const DBTranslator< ALLOC >& new_translator,
379  const std::size_t k,
380  const bool k_is_input_col = false);
381 
382  /// change the translator of a database column
383  /**
384  * When changing the translator for a column, we update the content of the database.
385  * @param var The variable corresponding to the new translator
386  * @param k either the column in the DatabaseTable (if k_is_input_col = false, the
387  * default) or the first column in the DatabaseTable which corresponds to the kth
388  * column of the input CSV (if k_is_input_col = true)
389  * @param k_is_input_col see Parameter k
390  * @warning if the translator does not exist, nothing is done. In
391  * particular, no exception is raised.
392  */
393  void changeTranslator(const Variable& var,
394  const std::size_t k,
395  const bool k_is_input_col = false);
396 
397  /// returns the set of translators
398  const DBTranslatorSet< ALLOC >& translatorSet() const;
399 
400  /** @brief returns either the kth translator of the database table or the
401  * first one reading the kth column of the input database
402  *
403  * Translators read an input dataset that is not necessarily the same as
404  * the content of the DatabaseTable. For instance, a CSV may contain 10
405  * columns, but if a DatabaseTable only contains two translators reading
406  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
407  * columns. When k_is_input_col is set to false, Parameter k passed in
408  * argument corresponds to either 0 or 1, i.e., the index of one of these
409  * two columns. When k_is_input_col is set to true, the translator returned
410  * is the first one that parses the kth column of the input database.
411  * @throw UndefinedElement is raised if there is no translator
412  * corresponding to k. */
413  const DBTranslator< ALLOC >& translator(const std::size_t k,
414  const bool k_is_input_col = false) const;
415 
416  /** @brief returns either the kth variable of the database table or the
417  * first one corresponding to the kth column of the input database
418  *
419  * Translators read an input dataset that is not necessarily the same as
420  * the content of the DatabaseTable. For instance, a CSV may contain 10
421  * columns, but if a DatabaseTable only contains two translators reading
422  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
423  * columns. When k_is_input_col is set to false, Parameter k passed in
424  * argument corresponds to either 0 or 1, i.e., the index of one of these
425  * two columns. When k_is_input_col is set to true, the variable is that
426  * of the translator that parses the kth column of the input database.
427  * @throw UndefinedElement is raised if there is no translator
428  * corresponding to k. */
429  const Variable& variable(const std::size_t k, const bool k_is_input_col = false) const;
430 
431  /// sets the names of the variables
433 
434  /// sets the names of the variables
435  /** This method can be called in two different ways: either the names
436  * correspond precisely to the columns stored into the database table
437  * (in this case, parameter from_external_object is equal to false),
438  * or they correspond to the columns of an external database (e.g., a
439  * CSV file) from which we potentially excluded some columns and,
440  * consequently, the latter should not be taken into account (in this
441  * case, parameter from_external_object is equal to true). As an
442  * example, imagine that the database table is created from a CSV file
443  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Suppose that
444  * we asked the database table to ignore columns X1 and X3. Then
445  * setVariableNames( { "X0", "X1", "X2", "X3", "X4" }, true ) will
446  * set the columns of the database table as { "X0", "X2", "X4" }. The
447  * same result could be obtained by executing
448  * setVariableNames( { "X0", "X2", "X4" }, false ), which specifies
449  * directly the set of names to retain in the database table.
450  * @param names the names of all the columns, including the ignored
451  * columns if from_external_object is set to true, else excluding
452  * them (i.e., this should precisely correspond to the columns stored
453  * into the database table).
454  * @param from_external_object a Boolean indicating whether parameter
455  * names includes the columns ignored by the database table (true) or
456  * not (false).
457  * @throw SizeError is raised if the names passed in arguments cannot be
458  * assigned to the columns of the DatabaseTable because the size of their
459  * vector is inadequate. */
460  virtual void setVariableNames(const std::vector< std::string, ALLOC< std::string > >& names,
461  const bool from_external_object = true) final;
462 
463  /** @brief makes the database table ignore from now on the kth column of
464  * the input dataset or the column parsed by the kth translator
465  *
466  * This method can be called in two different ways: either k refers to
467  * the current kth column of the database table (in this case parameter
468  * from_external_object is set to false), or k corresponds to the kth
469  * column of an original database used to fill the database table
470  * (in this case from_external_object is set to true). Depending on
471  * from_external_object's value, the ignored columns may differ. As an
472  * example, imagine that the database table is created from a CSV file
473  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Then a call to
474  * ignoreColumn ( 1, true ) will exclude column X1 from the database table.
475  * As a result, the database table columns are X0, X2, X3 and X4.
476  * Therefore, subsequently calling ignoreColumn ( 1, false ) will result
477  * in excluding X2 since X2 is the 2nd column (columns are indexed
478  * starting from 0). So, now the database table's columns are
479  * X0, X3 and X4. If, now, we call ignoreColumn ( 3, true ), this will
480  * remove column X3 because, in the original database, X3 was the 4th
481  * column.
482  *
483  * The method also erases all the translators corresponding to column k,
484  * if any. If the DatabaseTable contains some rows, then their column
485  * corresponding to k is removed. If the resulting DatabaseTable
486  * contains only empty rows, then those are removed.
487  *
488  * @param k the column to remove. See Method setVariableNames for a
489  * detailed description on how k is computed.
490  * @param from_external_object indicates whether k refers to the kth
491  * column of an original external database (true) or to the current kth
492  * column of the DatabaseTable (false).
493  * @throw UndefinedElement is raised if k refers to the position of a
494  * translator that does not exist (k >= number of translators). */
495  virtual void ignoreColumn(const std::size_t k, const bool from_external_object = true) final;
496 
497  /// returns the set of columns of the original dataset that are ignored
498  /** In this vector, all the column indices greater than or equal to its
499  * last element are also ignored. */
500  virtual const DBVector< std::size_t > ignoredColumns() const final;
501 
502  /** @brief returns the set of columns of the original dataset that are
503  * present in the DatabaseTable */
504  virtual const DBVector< std::size_t > inputColumns() const final;
505 
506  /** @brief returns the domain size of the kth variable of the database
507  * table or of that of the first one corresponding to the kth column of
508  * the input database
509  *
510  * Translators read an input dataset that is not necessarily the same as
511  * the content of the DatabaseTable. For instance, a CSV may contain 10
512  * columns, but if a DatabaseTable only contains two translators reading
513  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
514  * columns. When k_is_input_col is set to false, Parameter k passed in
515  * argument corresponds to either 0 or 1, i.e., the index of one of these
516  * two columns. When k_is_input_col is set to true, the variable is that
517  * of the translator that parses the kth column of the input database.
518  * @throw UndefinedElement is raised if there is no translator
519  * corresponding to k. */
520  std::size_t domainSize(const std::size_t k, const bool k_is_input_col = false) const;
521 
522  /// returns the domain sizes of all the variables in the database table
523  DBVector< std::size_t > domainSizes() const;
524 
525  /** @brief indicates whether a reordering is needed to sort the translations
526  * of the kth translator or those of the first translator parsing the kth
527  * column
528  *
529  * For a given translator, if the strings represented by the translations
530  * are only numbers, the translations are considered to be sorted if and
531  * only if they are sorted by increasing number. If the strings do not
532  * only represent numbers, then translations are considered to be sorted
533  * if and only if they are sorted lexicographically.
534  *
535  * When constructing dynamically its dictionary, the translator may
536  * assign wrong DBTranslatedValue values to strings. For instance, a
537  * translator reading sequentially integer strings 4, 1, 3, may map
538  * 4 into DBTranslatedValue{std::size_t(0)},
539  * 1 into DBTranslatedValue{std::size_t(1)} and
540  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
541  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
542  * a domain specified with increasing values. This requires a
543  * reordering. Method needsReordering() returns a Boolean indicating
544  * whether such a reordering should be performed or whether the current
545  * order is OK.
546  *
547  * Translators read an input dataset that is not necessarily the same as
548  * the content of the DatabaseTable. For instance, a CSV may contain 10
549  * columns, but if a DatabaseTable only contains two translators reading
550  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
551  * columns. When k_is_input_col is set to false, Parameter k passed in
552  * argument corresponds to either 0 or 1, i.e., the index of one of these
553  * two columns. When k_is_input_col is set to true, the translator to be
554  * reordered is that which parses the kth column of the input database.
555  * @throw UndefinedElement is raised if there is no translator
556  * corresponding to k. */
557  bool needsReordering(const std::size_t k, const bool k_is_input_col = false) const;
558 
559  /** @brief performs a reordering of the kth translator or
560  * of the first translator parsing the kth column of the input database
561  *
562  * For a given translator, if the strings represented by the translations
563  * are only numbers, the translations are considered to be sorted if and
564  * only if they are sorted by increasing number. If the strings do not
565  * only represent numbers, then translations are considered to be sorted
566  * if and only if they are sorted lexicographically.
567  *
568  * Translators read an input dataset that is not necessarily the same as
569  * the content of the DatabaseTable. For instance, a CSV may contain 10
570  * columns, but if a DatabaseTable only contains two translators reading
571  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
572  * columns. When k_is_input_col is set to false, Parameter k passed in
573  * argument corresponds to either 0 or 1, i.e., the index of one of these
574  * two columns. When k_is_input_col is set to true, the translator to be
575  * reordered is that which parses the kth column of the input database.
576  * @throw UndefinedElement is raised if there is no translator
577  * corresponding to k. */
578  void reorder(const std::size_t k, const bool k_is_input_col = false);
579 
580  /// performs a reordering of all the columns
581  /** For a given translator, if the strings represented by the translations
582  * are only numbers, the translations are considered to be sorted if and
583  * only if they are sorted by increasing number. If the strings do not
584  * only represent numbers, then translations are considered to be sorted
585  * if and only if they are sorted lexicographically. */
586  void reorder();
587 
588  /// insert a new row at the end of the database
590 
591  /// insert a new row at the end of the database
592  /** The new_row passed in argument is supposed to come from an external
593  * database. So it must contain data for the ignored columns.
594  * @throws SizeError is raised if the vector of string cannot be inserted
595  * in the DatabaseTable because its size does not allow a matching with
596  * the columns of the DatabaseTable (taking into account the ignored
597  * columns)
598  * @throws UnknownLabelInDatabase is raised if the translation of an
599  * element in the new row cannot be found and the corresponding translator
600  * is not in an editable dictionary mode.
601  * @throws SizeError is raised if the number of entries in the dictionary
602  * of a translator has already reached its maximum.
603  * @throws OperationNotAllowed exception is raised if the translation of
604  * an element in new_row cannot be found and the insertion of the string
605  * into the corresponding translator's dictionary fails because it would
606  * induce incoherent behavior (e.g., a DBTranslator4ContinuousVariable
607  * that contains a variable whose domain is [x,y] as well as a missing
608  * value symbol z \f$\in\f$ [x,y]).
609  * @throws TypeError is raised if the translation of an element in new_row
610  * cannot be found and the insertion of the string into the translator's
611  * dictionary fails due to str being impossible to be converted into an
612  * appropriate type.
613  */
614  virtual void insertRow(const std::vector< std::string, ALLOC< std::string > >& new_row) final;
615 
616  /// insert a new DBRow at the end of the database
617  /** Unlike methods insertRow for data whose type is different from
618  * DBTranslatedValue, this method assumes that the new row passed in
619  * argument does not contain any data of the ignored columns. So,
620  * basically, it could be copied as is into the database table.
621  * @throw SizeError is raised if the size of the new_row is not equal to
622  * the number of translators of the DatabaseTable
623  * @throw InvalidArgument is raised if at least one element of new_row does
624  * not belong to the domain of its corresponding translator.
625  */
626  virtual void insertRow(Row< DBTranslatedValue >&& new_row,
627  const IsMissing contains_missing_data) final;
628 
629  /// insert a new row at the end of the database
630  /** Unlike methods insertRow for data whose type is different from
631  * DBTranslatedValue, this method assumes that the new row passed in
632  * argument does not contain any data of the ignored columns. So,
633  * basically, it could be copied as is into the database table.
634  * @throw SizeError is raised if the size of the new_row is not equal to
635  * the number of translators of the DatabaseTable
636  * @throw InvalidArgument is raised if at least one element of new_row does
637  * not belong to the domain of its corresponding translator.
638  */
639  virtual void insertRow(const Row< DBTranslatedValue >& new_row,
640  const IsMissing contains_missing_data) final;
641 
642  /// insert a new DBRow of DBCells at the end of the database (const-reference overload)
643  /** The new_row passed in argument is supposed to come from an external
644  * database. So it must contain data for the ignored columns.
645  * @throw SizeError is raised if the row of DBCells cannot be inserted
646  * in the DatabaseTable because its size does not allow a matching with
647  * the columns of the DatabaseTable (taking into account the ignored
648  * columns) */
649  virtual void insertRow(const Row< DBCell >& new_row) final;
650 
651  /// insert a new DBRow of DBCells at the end of the database (rvalue-reference overload)
652  /** The new_row passed in argument is supposed to come from an external
653  * database. So it must contain data for the ignored columns.
654  * @throw SizeError is raised if the row of DBCells cannot be inserted
655  * in the DatabaseTable because its size does not allow a matching with
656  * the columns of the DatabaseTable (taking into account the ignored
657  * columns) */
658  virtual void insertRow(Row< DBCell >&& new_row) final;
659 
660  /// insert a set of new DBRows at the end of the database (rvalue-reference overload)
661  /** Unlike the insertRows methods for data whose type is different from
662  * DBTranslatedValue, this method assumes that the new rows passed in
663  * argument do not contain any data of the ignored columns. So, basically,
664  * these rows could be copied as is into the database table.
665  * @throw SizeError is raised if the size of at least one row in new_rows
666  * is not equal to the number of translators in the DatabaseTable
667  * @throw InvalidArgument is raised if at least one element of new_rows does
668  * not belong to the domain of its corresponding translator.
669  * NOTE(review): rows_have_missing_vals presumably holds one IsMissing flag per row of new_rows — confirm */
670  virtual void insertRows(Matrix< DBTranslatedValue >&& new_rows,
671  const DBVector< IsMissing >& rows_have_missing_vals) final;
672 
673  /// insert a set of new DBRows at the end of the database (const-reference overload)
674  /** Unlike the insertRows methods for data whose type is different from
675  * DBTranslatedValue, this method assumes that the new rows passed in
676  * argument do not contain any data of the ignored columns. So, basically,
677  * these rows could be copied as is into the database table.
678  * @throw SizeError is raised if the size of at least one row in new_rows
679  * is not equal to the number of translators in the DatabaseTable
680  * @throw InvalidArgument is raised if at least one element of new_rows does not belong to the domain of its corresponding translator.
681  * NOTE(review): rows_have_missing_vals presumably holds one IsMissing flag per row of new_rows — confirm */
682  virtual void insertRows(const Matrix< DBTranslatedValue >& new_rows,
683  const DBVector< IsMissing >& rows_have_missing_vals) final;
684 
685  /// insert a set of new DBRows of DBCells at the end of the database (rvalue-reference overload)
686  /** The new rows passed in argument are supposed to come from an external
687  * database. So they must contain data for the ignored columns.
688  * @throw SizeError is raised if at least one of the new rows cannot be inserted
689  * in the DatabaseTable because its size does not allow a matching with
690  * the columns of the DatabaseTable (taking into account the ignored
691  * columns) */
692  virtual void insertRows(Matrix< DBCell >&& new_rows) final;
693 
694  /// insert a set of new DBRows of DBCells at the end of the database (const-reference overload)
695  /** The new rows passed in argument are supposed to come from an external
696  * database. So they must contain data for the ignored columns.
697  * @throw SizeError is raised if at least one of the new rows cannot be inserted
698  * in the DatabaseTable because its size does not allow a matching with
699  * the columns of the DatabaseTable (taking into account the ignored
700  * columns) */
701  virtual void insertRows(const Matrix< DBCell >& new_rows) final;
702 
703  /// erases the content of the database, including the names of its variables
704  virtual void clear() final;
705 
706  // substitutes the kth translator by another one
707  /* The method checks that:
708  * 1/ it is possible to get back the original values of the database
709  * for the rows already translated.
710  * 2/ that the new translator is capable of translating these values.
711  *
712  * If both checks passed, then it replaces the kth translator
713  * by the one passed in arguments and retranslates with it the kth
714  * cell of all the rows already contained in the database */
715 
716  /// @}
717 
718 
719 #ifndef DOXYGEN_SHOULD_SKIP_THIS
720 
721  private:
722  /// the set of translators used to convert the input strings into the DBTranslatedValue instances stored in the table
723  DBTranslatorSet< ALLOC > _translators_;
724 
725  /// the set of columns of the input dataset that the user asked to ignore
726  Set< std::size_t, ALLOC< std::size_t > > _ignored_cols_;
727 
728  /** @brief checks that the values of a row are compatible with those of the
729  * translators' variables (NOTE(review): presumably returns true when they are — confirm) */
730  bool _isRowCompatible_(const Row< DBTranslatedValue >& row) const;
731 
732  /** @brief returns the index corresponding either to the kth translator or
733  * to the first one that parses the kth column of the input dataset
734  *
735  * @warning if the translator does not exist, the function returns an
736  * index which is greater than the number of translators */
737  std::size_t _getKthIndex_(const std::size_t k, const bool k_is_input_col) const;
738 
739  /** @brief returns the indices corresponding either to the kth translator
740  * or to all those that parse the kth column of the input dataset
741  *
742  * @warning the indices are sorted in decreasing order */
743  DBVector< std::size_t > _getKthIndices_(const std::size_t k, const bool k_is_input_col) const;
744 
745  /// a method to process the rows of the database using multithreading
746  /** The function tries to execute function/functor exec_func using one
747  * or several threads. If an exception is raised by at least one thread,
748  * then function undo_func is executed to undo what exec_func
749  * did, and the exception is rethrown.
750  *
751  * @param exec_func this should be a function/functor/lambda that
752  * takes 3 arguments: the first one is a std::size_t containing the
753  * index of the first row that it should process, the second argument is
754  * a std::size_t equal to 1 + the index of the last row processed, so that
755  * the processing is performed on [first,last). The last argument is an index
756  * identifying the thread in which exec_func is performed (the first one has index 0,
757  * the second one index 1, and so on). The return type of exec_func
758  * is void. If a thread executing exec_func raises an exception, then
759  * before exiting, it should undo what it did.
760  * @param undo_func a function/functor/lambda with the same
761  * prototype as exec_func. If a thread raises an exception, those that
762  * did not raise exceptions should undo what they did in order to restore
763  * the state that the database had before the execution of the threads. After
764  * calling undo_func, they should have restored this state.
765  */
766  template < typename Functor1, typename Functor2 >
767  void _threadProcessDatabase_(Functor1& exec_func, Functor2& undo_func);
768 
769 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
770  };
771 
772  } /* namespace learning */
773 
774 } /* namespace gum */
775 
776 /// always include the templated implementations
777 #include <agrum/tools/database/databaseTable_tpl.h>
778 
779 #endif /* GUM_DATABASE_TABLE_H */
virtual ~DatabaseTable()
destructor
virtual void insertRow(const Row< DBTranslatedValue > &new_row, const IsMissing contains_missing_data) final
insert a new row at the end of the database
virtual void insertRows(Matrix< DBCell > &&new_rows) final
insert a set of new DBRows at the end of the database
DatabaseTable(const DatabaseTable< ALLOC > &from)
copy constructor
virtual void ignoreColumn(const std::size_t k, const bool from_external_object=true) final
makes the database table ignore from now on the kth column of the input dataset or the column parsed ...
DBVector< std::size_t > domainSizes() const
returns the domain sizes of all the variables in the database table
const DBTranslatorSet< ALLOC > & translatorSet() const
returns the set of translators
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
virtual void insertRows(const Matrix< DBTranslatedValue > &new_rows, const DBVector< IsMissing > &rows_have_missing_vals) final
insert a set of new DBRows at the end of the database
virtual void insertRow(const std::vector< std::string, ALLOC< std::string > > &new_row) final
insert a new row at the end of the database
DatabaseTable(DatabaseTable< ALLOC > &&from)
move constructor
DatabaseTable(const DatabaseTable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
void changeTranslator(const Variable &var, const std::size_t k, const bool k_is_input_col=false)
change the translator of a database column
bool needsReordering(const std::size_t k, const bool k_is_input_col=false) const
indicates whether a reordering is needed to sort the translations of the kth translator or those of t...
std::size_t insertTranslator(const Variable &var, const std::size_t input_column, const bool unique_column=true)
insert a new translator into the database table
virtual DatabaseTable< ALLOC > * clone() const final
virtual copy constructor
virtual void insertRow(const Row< DBCell > &new_row) final
insert a new DBRow of DBCells at the end of the database
virtual const DBVector< std::size_t > inputColumns() const final
returns the set of columns of the original dataset that are present in the DatabaseTable ...
const DBTranslator< ALLOC > & translator(const std::size_t k, const bool k_is_input_col=false) const
returns either the kth translator of the database table or the first one reading the kth column of th...
virtual void insertRow(Row< DBTranslatedValue > &&new_row, const IsMissing contains_missing_data) final
insert a new DBRow at the end of the database
std::size_t domainSize(const std::size_t k, const bool k_is_input_col=false) const
returns the domain size of the kth variable of the database table or of that of the first one corresp...
virtual const DBVector< std::size_t > ignoredColumns() const final
returns the set of columns of the original dataset that are ignored
DatabaseTable< ALLOC > & operator=(const DatabaseTable< ALLOC > &from)
copy operator
DatabaseTable(DatabaseTable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual void insertRow(Row< DBCell > &&new_row) final
insert a new DBRow of DBCells at the end of the database
const Variable & variable(const std::size_t k, const bool k_is_input_col=false) const
returns either the kth variable of the database table or the first one corresponding to the kth colum...
virtual void insertRows(Matrix< DBTranslatedValue > &&new_rows, const DBVector< IsMissing > &rows_have_missing_vals) final
insert a set of new DBRows at the end of the database
std::size_t insertTranslator(const Variable &var, const std::size_t input_column, std::vector< std::string, XALLOC< std::string > > missing_symbols, const bool unique_column=true)
insert a new translator into the database table
The class representing a tabular database as used by learning tasks.
virtual void clear() final
erase the content of the database, including the names of the variables
virtual void setVariableNames(const std::vector< std::string, ALLOC< std::string > > &names, const bool from_external_object=true) final
sets the names of the variables
DatabaseTable< ALLOC > & operator=(DatabaseTable< ALLOC > &&from)
move operator
virtual void insertRows(const Matrix< DBCell > &new_rows) final
insert a set of new DBRows at the end of the database
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
DatabaseTable(const MissingValType< XALLOC > &missing_symbols, const DBTranslatorSet< ALLOC > &translators=DBTranslatorSet< ALLOC >(), const allocator_type &alloc=allocator_type())
default constructor
void reorder(const std::size_t k, const bool k_is_input_col=false)
performs a reordering of the kth translator or of the first translator parsing the kth column of the ...
void eraseTranslators(const std::size_t k, const bool k_is_input_col=false)
erases either the kth translator or all those parsing the kth column of the input dataset ...
virtual DatabaseTable< ALLOC > * clone(const allocator_type &alloc) const final
virtual copy constructor with a given allocator
void reorder()
performs a reordering of all the columns