aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
databaseTable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class representing a tabular database stored in RAM
24  *
25  * Class DatabaseTable represents a tabular database that stores in the
26  * computer's random access memory (RAM) its content as a vector of DBRows
27  * of DBTranslatedValue instances.
28  *
29  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
30  */
31 #ifndef GUM_DATABASE_TABLE_H
32 #define GUM_DATABASE_TABLE_H
33 
34 #include <numeric>
35 #include <algorithm>
36 #include <functional>
37 #include <exception>
38 #include <vector>
39 
40 #include <agrum/agrum.h>
41 #include <agrum/tools/core/math/math_utils.h>
42 #include <agrum/tools/core/set.h>
43 #include <agrum/tools/core/thread.h>
44 #include <agrum/tools/database/DBCell.h>
45 #include <agrum/tools/database/DBRow.h>
46 #include <agrum/tools/database/DBTranslatedValue.h>
47 #include <agrum/tools/database/IDatabaseTable.h>
48 #include <agrum/tools/database/DBTranslatorSet.h>
49 #include <agrum/tools/database/DBTranslator4ContinuousVariable.h>
50 
51 namespace gum {
52 
53  namespace learning {
54 
55  /** @class DatabaseTable
56  * @brief The class representing a tabular database as used by learning tasks.
57  * @headerfile databaseTable.h <agrum/tools/database/databaseTable.h>
58  * @ingroup learning_database
59  *
60  * Class DatabaseTable represents a tabular database that stores in the
61  * computer's random access memory (RAM) its content as a vector of DBRows
62  * of DBTranslatedValue instances. This class is very well suited for fast
63  * learning algorithms.
64  *
65  * @par Usage example:
66  * @code
67  * // create the database from a CSV. This is not compulsory for
68  * // DatabaseTable instances, but this is how we usually create
69  * // DatabaseTable instances
70  * gum::learning::DBInitializerFromCSV<> initializer ( "asia.csv" );
71  * const auto& var_names = initializer.variableNames ();
72  * gum::learning::DBTranslatorSet<> translator_set;
73  * gum::learning::DBTranslator4LabelizedVariable<> translator;
74  * for ( std::size_t i = 0; i < var_names.size(); ++i )
75  * translator_set.insertTranslator ( translator, i );
76  * gum::learning::DatabaseTable<> database ( translator_set );
77  * database.setVariableNames( initializer.variableNames () );
78  *
79  * // here, database contains the content of the asia.csv file.
80  * // determine how many columns and rows the database contains
81  * std::size_t nb_rows = database.nbRows();
82  * std::size_t nb_cols = database.nbVariables ();
83  *
84  * // manually add a new row into the database
85  * std::vector<std::string> row( 8, "toto" ); // asia has 8 columns
86  * database.insertRow ( row );
87  * gum::learning::DBRow<gum::learning::DBTranslatedValue>
88  * dbrow ( 8, gum::learning::DBTranslatedValue { std::size_t(0) } );
89  * database.insertRow ( dbrow );
90  * // insert 4 rows in a single call:
91  * database.insertRows(
92  * std::vector<gum::learning::DBRow<gum::learning::DBTranslatedValue>>
93  * ( 4, dbrow ) );
94  *
95  * // erase some rows
96  * database.eraseRow ( 12 ); // erase the 13th row of the database
97  * database.eraseFirstRow (); // erase the first row of the database
98  * database.eraseLastRow (); // erase the last row of the database
99  * database.eraseFirstRows ( 2 ); // erase the first two rows
100  * database.eraseLastRows ( 3 ); // erase the last three rows
101  * database.eraseRows ( 2,4 ); // erase rows indexed from 2 to 4 (excluded)
102  *
103  * // parse the content of the database, the usual way
104  * for ( const auto& dbrow : database )
105  * std::cout << dbrow.row() << " weight: " << dbrow.weight() << std::endl;
106  *
107  * // ignore some columns of the database, i.e., remove them
108  * database.ignoreColumn ( 3 ); // remove the column X3 of the CSV file
109  * // now, the database contains columns 0, 1, 2, 4, 5, 6, 7 of the
110  * // CSV file. If we wish to remove Column X5 of the CSV file:
111  * database.ignoreColumn ( 5 ); // remove the column X5 of the CSV file
112  * // now, the database contains columns 0, 1, 2, 4, 6, 7 of the CSV file.
113  * // if we wish to remove the 5th column of the DatabaseTable, i.e.,
114  * // its column #4, either we determine that this actually corresponds
115  * // to column X6 of the CSV and we use database.ignoreColumn ( 6 ) or
116  * // we call:
117  * database.ignoreColumn ( 4, false ); // false => 4 = the 5th column of
118  * // the DatabaseTable, not the 5th column/variable of the CSV file
119  * // (remember that all column numbers start from 0).
120  *
121  * // display the columns of the CSV that were ignored and those that
122  * // were kept:
123  * std::vector<std::size_t> ignored_cols = database.ignoredColumns ();
124  * std::vector<std::size_t> kept_cols = database.inputColumns ();
125  *
126  * // parse the content of the database using handlers
127  * typename gum::learning::DatabaseTable<>::HandlerSafe handler( database );
128  * typename gum::learning::DatabaseTable<>::Handler uhandler( database );
129  * // by default, the handlers range over the whole database
130  *
131  * // change the range of rows handled by the DBHandler
132  * handler.setRange ( 1, 40 ); // now parses rows [1,40)
133  * std::cout << handler.size (); // displays 39: rows 1,...,39
134  * std::cout << handler.DBSize (); // shows the number of rows in the database
135  * std::cout << handler.numRow (); // displays 0: the handler currently
136  * // points on the first row of its managed area [1,40)
137  *
138  * // move the handler to the next row
139  * handler.nextRow();
140  * std::cout << handler.numRow (); // displays 1: the handler points now
141  * // on the second row of its managed area. This corresponds to the third
142  * // DBRow of the database since the range of handler is [1,40)
143  * ++handler; // move again to the next row
144  * std::cout << handler.numRow (); // displays 2
145  * handler += 4; // advances the pointer by 4 rows
146  * std::cout << handler.numRow (); // displays 6
147  *
148  * // get the DBRow pointed to by the handler: this is the 7th DBRow
149  * // of the database
150  * const auto& xrow7 = handler.row (); // get the DBRow, unsafe version
151  * const auto& yrow7 = handler.rowSafe (); // get the DBRow, safe version
152  * const std::vector<gum::learning::DBTranslatedValue>& xrow = xrow7.row ();
153  * const double xweight = xrow7.weight ();
154  *
155  * // another way to access the row
156  * const auto& zrow7 = *handler; // get the DBRow, unsafe version
157  *
158  * // check whether there exist other rows managed by the handler after
159  * // the current row
160  * bool has_rows = handler.hasRows (); // true: there remain 33 rows
161  *
162  * // makes the handler point again on the 2nd row of the database
163  * handler.reset (); // the handler points to the beginning of its area
164  * std::cout << handler.numRow (); // displays 0: the handler currently
165  * // points on the first row of its managed area [1,40)
166  *
167  * // see the variables' names, i.e., the names of the database's columns
168  * const auto& vars = handler.variableNames();
169  *
170  * // parse all the rows managed
171  * handler.reset ();
172  * for ( auto end = handler.end (); handler != end; ++handler )
173  * std::cout << handler.row ().weight () << std::endl;
174  *
175  * // another possibility:
176  * for ( const auto& row : handler )
177  * std::cout << row.weight () << std::endl;
178  *
179  *
180  * // clear the content of the database and update the database's
181  * // handlers
182  * database.clear ();
183  * @endcode
184  */
185  template < template < typename > class ALLOC = std::allocator >
187  public:
188  /// the type for the vectors used in the DatabaseTable
189  template < typename TX_DATA >
190  using DBVector = std::vector< TX_DATA, ALLOC< TX_DATA > >;
191 
192  /// a row of the database
193  template < typename TX_DATA >
194  using Row = DBRow< TX_DATA, ALLOC >;
195 
196  /// the type for the matrices stored into the database
197  template < typename TX_DATA >
198  using Matrix = std::vector< DBRow< TX_DATA, ALLOC >,
200 
201  template < template < typename > class XALLOC >
202  using MissingValType = std::vector< std::string, XALLOC< std::string > >;
203 
204  /// the unsafe handler type
206 
207  /// the safe handler type
208  using HandlerSafe =
210 
211  using IsMissing =
213 
214  /// Types for STL compliance.
215  /// @{
218  using const_reference = const value_type&;
219  using pointer = value_type*;
220  using const_pointer = const value_type*;
221  using size_type = std::size_t;
222  using difference_type = std::ptrdiff_t;
223  using iterator = Handler;
226  /// @}
227 
228 
229  // ##########################################################################
230  /// @name Constructors / Destructors
231  // ##########################################################################
232  /// @{
233 
234  /// default constructor taking the symbols that represent missing values
235  template < template < typename > class XALLOC >
236  DatabaseTable(const MissingValType< XALLOC >& missing_symbols,
237  const DBTranslatorSet< ALLOC >& translators
238  = DBTranslatorSet< ALLOC >(),
239  const allocator_type& alloc = allocator_type());
240 
241  /// default constructor
242  DatabaseTable(const DBTranslatorSet< ALLOC >& translators
243  = DBTranslatorSet< ALLOC >(),
244  const allocator_type& alloc = allocator_type());
245 
246  /// copy constructor
247  DatabaseTable(const DatabaseTable< ALLOC >& from);
248 
249  /// copy constructor with a given allocator
250  DatabaseTable(const DatabaseTable< ALLOC >& from,
251  const allocator_type& alloc);
252 
253  /// move constructor
254  DatabaseTable(DatabaseTable< ALLOC >&& from);
255 
256  /// move constructor with a given allocator
257  DatabaseTable(DatabaseTable< ALLOC >&& from, const allocator_type& alloc);
258 
259  /// virtual copy constructor
260  virtual DatabaseTable< ALLOC >* clone() const final;
261 
262  /// virtual copy constructor with a given allocator
263  virtual DatabaseTable< ALLOC >*
264  clone(const allocator_type& alloc) const final;
265 
266  /// destructor
267  virtual ~DatabaseTable();
268 
269  /// @}
270 
271  // ##########################################################################
272  /// @name Operators
273  // ##########################################################################
274  /// @{
275 
276  /// copy operator
278 
279  /// move operator
281 
282  /// @}
283 
284 
285  // ##########################################################################
286  /// @name Accessors / Modifiers
287  // ##########################################################################
288  /// @{
289 
290  /// insert a new translator into the database table
291  /** @param translator This translator is copied into the DatabaseTable
292  * @param input_column indicates which column in the original dataset
293  * (usually a CSV file) the translator will read
294  * @param unique_column indicates whether the input column can be read by
295  * several translators.
296  * @return the index of the translator within the set of translators
297  * @throws OperationNotAllowed if the input column is marked as ignored
298  * @throws DuplicateElement if there already exists a translator
299  * reading the input column passed in argument, and if the unique_column
300  * is set to true
301  * @warning if the database is not empty, i.e., it contains some records,
302  * the whole column of the database corresponding to the new translator is
303  * filled with missing values.
304  */
305  std::size_t insertTranslator(const DBTranslator< ALLOC >& translator,
306  const std::size_t input_column,
307  const bool unique_column = true);
308 
309  /// insert a new translator into the database table
310  /** @param var the variable that will be contained into the translator
311  * @param input_column indicates which column in the original dataset
312  * (usually a CSV file) the translator will read
313  * @param unique_column indicates whether the input column can be read by
314  * several translators
315  * @param missing_symbols the set of symbols in the database
316  * representing missing values
317  * @return the index of the translator within the set of translators
318  * @throws OperationNotAllowed if the input column is marked as ignored
319  * @throws DuplicateElement if there already exists a translator
320  * reading the input column passed in argument, and if the unique_column
321  * is set to true
322  * @throws MissingValueInDatabase if the database is not empty, i.e., it contains some records,
323  * all the columns of the database corresponding to the new translator
324  * should be filled with missing values, which is impossible since we do
325  * not know which symbols correspond to missing values. Therefore, we
326  * raise a MissingValueInDatabase exception. If you do not want such a
327  * behavior, use method insertTranslator in which you specify the set of
328  * missing symbols.
329  */
330  std::size_t insertTranslator(const Variable& var,
331  const std::size_t input_column,
332  const bool unique_column = true);
333 
334  /// insert a new translator into the database table
335  /** @param var the variable that will be contained into the translator
336  * @param input_column indicates which column in the original dataset
337  * (usually a CSV file) the translator will read
338  * @param unique_column indicates whether the input column can be read by
339  * several translators
340  * @param missing_symbols the set of symbols in the database
341  * representing missing values
342  * @return the index of the translator within the set of translators
343  * @throws OperationNotAllowed if the input column is marked as ignored
344  * @throws DuplicateElement if there already exists a translator
345  * reading the input column passed in argument, and if the unique_column
346  * is set to true
347  * @warning if the database is not empty, i.e., it contains some records,
348  * the whole column of the database corresponding to the new translator is
349  * filled with missing values.
350  */
351  template < template < typename > class XALLOC >
352  std::size_t insertTranslator(
353  const Variable& var,
354  const std::size_t input_column,
355  std::vector< std::string, XALLOC< std::string > > missing_symbols,
356  const bool unique_column = true);
357 
358  /** @brief erases either the kth translator or all those parsing the kth
359  * column of the input dataset
360  *
361  * Translators read an input dataset that is not necessarily the same as
362  * the content of the DatabaseTable. For instance, a CSV may contain 10
363  * columns, but if a DatabaseTable only contains two translators reading
364  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
365  * columns. When k_is_input_col is set to false, Parameter k passed in
366  * argument corresponds to either 0 or 1, i.e., to the index of one of
367  * these two output columns. When k_is_input_col is set to true, the
368  * translators to be erased are all those that parse the kth column of the
369  * input database.
370  * @warning if the translator does not exist, nothing is done. In
371  * particular, no exception is raised. */
372  void eraseTranslators(const std::size_t k,
373  const bool k_is_input_col = false);
374 
375  /// returns the set of translators
376  const DBTranslatorSet< ALLOC >& translatorSet() const;
377 
378  /** @brief returns either the kth translator of the database table or the
379  * first one reading the kth column of the input database
380  *
381  * Translators read an input dataset that is not necessarily the same as
382  * the content of the DatabaseTable. For instance, a CSV may contain 10
383  * columns, but if a DatabaseTable only contains two translators reading
384  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
385  * columns. When k_is_input_col is set to false, Parameter k passed in
386  * argument corresponds to either 0 or 1, i.e., the index of one of these
387  * two columns. When k_is_input_col is set to true, the translator returned
388  * is the first one that parses the kth column of the input database.
389  * @throw UndefinedElement is raised if there is no translator
390  * corresponding to k. */
391  const DBTranslator< ALLOC >&
392  translator(const std::size_t k, const bool k_is_input_col = false) const;
393 
394  /** @brief returns either the kth variable of the database table or the
395  * first one corresponding to the kth column of the input database
396  *
397  * Translators read an input dataset that is not necessarily the same as
398  * the content of the DatabaseTable. For instance, a CSV may contain 10
399  * columns, but if a DatabaseTable only contains two translators reading
400  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
401  * columns. When k_is_input_col is set to false, Parameter k passed in
402  * argument corresponds to either 0 or 1, i.e., the index of one of these
403  * two columns. When k_is_input_col is set to true, the variable is that
404  * of the translator that parses the kth column of the input database.
405  * @throw UndefinedElement is raised if there is no translator
406  * corresponding to k. */
407  const Variable& variable(const std::size_t k,
408  const bool k_is_input_col = false) const;
409 
410  /// sets the names of the variables
412 
413  /// sets the names of the variables
414  /** This method can be called in two different ways: either the names
415  * correspond precisely to the columns stored into the database table
416  * (in this case, parameter from_external_object is equal to false),
417  * or they correspond to the columns of an external database (e.g., a
418  * CSV file) from which we potentially excluded some columns and,
419  * consequently, the latter should not be taken into account (in this
420  * case, parameter from_external_object is equal to true). As an
421  * example, imagine that the database table is created from a CSV file
422  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Suppose that
423  * we asked the database table to ignore columns X1 and X3. Then
424  * setVariableNames( { "X0", "X1", "X2", "X3", "X4" }, true ) will
425  * set the columns of the database table as { "X0", "X2", "X4" }. The
426  * same result could be obtained by executing
427  * setVariableNames( { "X0", "X2", "X4" }, false ), which specifies
428  * directly the set of names to retain in the database table.
429  * @param names the names of all the columns, including the ignored
430  * columns if from_external_object is set to true, else excluding
431  * them (i.e., this should precisely correspond to the columns stored
432  * into the database table).
433  * @param from_external_object a Boolean indicating whether parameter
434  * names includes the columns ignored by the database table (true) or
435  * not (false).
436  * @throw SizeError is raised if the names passed in arguments cannot be
437  * assigned to the columns of the DatabaseTable because the size of their
438  * vector is inadequate. */
439  virtual void setVariableNames(
440  const std::vector< std::string, ALLOC< std::string > >& names,
441  const bool from_external_object = true) final;
442 
443  /** @brief makes the database table ignore from now on the kth column of
444  * the input dataset or the column parsed by the kth translator
445  *
446  * This method can be called in two different ways: either k refers to
447  * the current kth column of the database table (in this case parameter
448  * from_external_object is set to false), or k corresponds to the kth
449  * column of an original database used to fill the database table
450  * (in this case from_external_object is set to true). Depending on
451  * from_external_object's value, the ignored columns may differ. As an
452  * example, imagine that the database table is created from a CSV file
453  * with 5 columns named X0, X1, X2, X3 and X4 respectively. Then a call to
454  * ignoreColumn ( 1, true ) will exclude column X1 from the database table.
455  * As a result, the database table columns are X0, X2, X3 and X4.
456  * Therefore, subsequently calling ignoreColumn ( 1, false ) will result
457  * in excluding X2 since X2 is the 2nd column (columns are indexed
458  * starting from 0). So, now the database table's columns are
459  * X0, X3 and X4. If, now, we call ignoreColumn ( 3, true ), this will
460  * remove column X3 because, in the original database, X3 was the 4th
461  * column.
462  *
463  * The method also erases all the translators corresponding to column k,
464  * if any. If the DatabaseTable contains some rows, then their column
465  * corresponding to k is removed. If the resulting DatabaseTable
466  * contains only empty rows, then those are removed.
467  *
468  * @param k the column to remove. See Method setVariableNames for a
469  * detailed description on how k is computed.
470  * @param from_external_object indicates whether k refers to the kth
471  * column of an original external database (true) or to the current kth
472  * column of the DatabaseTable (false).
473  * @throw UndefinedElement is raised if k refers to the position of a
474  * translator that does not exist (k >= number of translators). */
475  virtual void ignoreColumn(const std::size_t k,
476  const bool from_external_object = true) final;
477 
478  /// returns the set of columns of the original dataset that are ignored
479  /** In this vector, all the column indices greater than or equal to its
480  * last element are also ignored. */
481  virtual const DBVector< std::size_t > ignoredColumns() const final;
482 
483  /** @brief returns the set of columns of the original dataset that are
484  * present in the DatabaseTable */
485  virtual const DBVector< std::size_t > inputColumns() const final;
486 
487  /** @brief returns the domain size of the kth variable of the database
488  * table or of that of the first one corresponding to the kth column of
489  * the input database
490  *
491  * Translators read an input dataset that is not necessarily the same as
492  * the content of the DatabaseTable. For instance, a CSV may contain 10
493  * columns, but if a DatabaseTable only contains two translators reading
494  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
495  * columns. When k_is_input_col is set to false, Parameter k passed in
496  * argument corresponds to either 0 or 1, i.e., the index of one of these
497  * two columns. When k_is_input_col is set to true, the variable is that
498  * of the translator that parses the kth column of the input database.
499  * @throw UndefinedElement is raised if there is no translator
500  * corresponding to k. */
501  std::size_t domainSize(const std::size_t k,
502  const bool k_is_input_col = false) const;
503 
504  /// returns the domain sizes of all the variables in the database table
505  DBVector< std::size_t > domainSizes() const;
506 
507  /** @brief indicates whether a reordering is needed to sort the translations
508  * of the kth translator or those of the first translator parsing the kth
509  * column
510  *
511  * For a given translator, if the strings represented by the translations
512  * are only numbers, the translations are considered to be sorted if and
513  * only if they are sorted by increasing number. If the strings do not
514  * only represent numbers, then translations are considered to be sorted
515  * if and only if they are sorted lexicographically.
516  *
517  * When constructing dynamically its dictionary, the translator may
518  * assign wrong DBTranslatedValue values to strings. For instance, a
519  * translator reading sequentially integer strings 4, 1, 3, may map
520  * 4 into DBTranslatedValue{std::size_t(0)},
521  * 1 into DBTranslatedValue{std::size_t(1)} and
522  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
523  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
524  * a domain specified with increasing values. This requires a
525  * reordering. Method needsReordering() returns a Boolean indicating
526  * whether such a reordering should be performed or whether the current
527  * order is OK.
528  *
529  * Translators read an input dataset that is not necessarily the same as
530  * the content of the DatabaseTable. For instance, a CSV may contain 10
531  * columns, but if a DatabaseTable only contains two translators reading
532  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
533  * columns. When k_is_input_col is set to false, Parameter k passed in
534  * argument corresponds to either 0 or 1, i.e., the index of one of these
535  * two columns. When k_is_input_col is set to true, the translator to be
536  * reordered is that which parses the kth column of the input database.
537  * @throw UndefinedElement is raised if there is no translator
538  * corresponding to k. */
539  bool needsReordering(const std::size_t k,
540  const bool k_is_input_col = false) const;
541 
542  /** @brief performs a reordering of the kth translator or
543  * of the first translator parsing the kth column of the input database
544  *
545  * For a given translator, if the strings represented by the translations
546  * are only numbers, the translations are considered to be sorted if and
547  * only if they are sorted by increasing number. If the strings do not
548  * only represent numbers, then translations are considered to be sorted
549  * if and only if they are sorted lexicographically.
550  *
551  * Translators read an input dataset that is not necessarily the same as
552  * the content of the DatabaseTable. For instance, a CSV may contain 10
553  * columns, but if a DatabaseTable only contains two translators reading
554  * columns 3 and 5 respectively, then the DatabaseTable only contains 2
555  * columns. When k_is_input_col is set to false, Parameter k passed in
556  * argument corresponds to either 0 or 1, i.e., the index of one of these
557  * two columns. When k_is_input_col is set to true, the translator to be
558  * reordered is that which parses the kth column of the input database.
559  * @throw UndefinedElement is raised if there is no translator
560  * corresponding to k. */
561  void reorder(const std::size_t k, const bool k_is_input_col = false);
562 
563  /// performs a reordering of all the columns
564  /** For a given translator, if the strings represented by the translations
565  * are only numbers, the translations are considered to be sorted if and
566  * only if they are sorted by increasing number. If the strings do not
567  * only represent numbers, then translations are considered to be sorted
568  * if and only if they are sorted lexicographically. */
569  void reorder();
570 
571  /// insert a new row at the end of the database
573 
574  /// insert a new row at the end of the database
575  /** The new_row passed in argument is supposed to come from an external
576  * database. So it must contain data for the ignored columns.
577  * @throws SizeError is raised if the vector of string cannot be inserted
578  * in the DatabaseTable because its size does not allow a matching with
579  * the columns of the DatabaseTable (taking into account the ignored
580  * columns)
581  * @throws UnknownLabelInDatabase is raised if the translation of an
582  * element in the new row cannot be found and the corresponding translator
583  * is not in an editable dictionary mode.
584  * @throws SizeError is raised if the number of entries in the dictionary
585  * of a translator has already reached its maximum.
586  * @throws OperationNotAllowed exception is raised if the translation of
587  * an element in new_row cannot be found and the insertion of the string
588  * into the corresponding translator's dictionary fails because it would
589  * induce incoherent behavior (e.g., a DBTranslator4ContinuousVariable
590  * that contains a variable whose domain is [x,y] as well as a missing
591  * value symbol z \f$\in\f$ [x,y]).
592  * @throws TypeError is raised if the translation of an element in new_row
593  * cannot be found and the insertion of the string into the translator's
594  * dictionary fails due to str being impossible to be converted into an
595  * appropriate type.
596  */
597  virtual void insertRow(
598  const std::vector< std::string, ALLOC< std::string > >& new_row) final;
599 
600  /// insert a new DBRow at the end of the database
601  /** Unlike methods insertRow for data whose type is different from
602  * DBTranslatedValue, this method assumes that the new row passed in
603  * argument does not contain any data of the ignored columns. So,
604  * basically, it could be copied as is into the database table.
605  * @throw SizeError is raised if the size of the new_row is not equal to
606  * the number of translators of the DatabaseTable
607  * @throw InvalidArgument is raised if at least one element of new_row does
608  * not belong to the domain of its corresponding translator.
609  */
610  virtual void insertRow(Row< DBTranslatedValue >&& new_row,
611  const IsMissing contains_missing_data) final;
612 
613  /// insert a new row at the end of the database
614  /** Unlike methods insertRow for data whose type is different from
615  * DBTranslatedValue, this method assumes that the new row passed in
616  * argument does not contain any data of the ignored columns. So,
617  * basically, it could be copied as is into the database table.
618  * @throw SizeError is raised if the size of the new_row is not equal to
619  * the number of translators of the DatabaseTable
620  * @throw InvalidArgument is raised if at least one element of new_row does
621  * not belong to the domain of its corresponding translator.
622  */
623  virtual void insertRow(const Row< DBTranslatedValue >& new_row,
624  const IsMissing contains_missing_data) final;
625 
626  /// insert a new DBRow of DBCells at the end of the database
627  /** The new_row passed in argument is supposed to come from an external
628  * database. So it must contain data for the ignored columns.
629  * @throw SizeError is raised if the vector of string cannot be inserted
630  * in the DatabaseTable because its size does not allow a matching with
631  * the columns of the DatabaseTable (taking into account the ignored
632  * columns) */
633  virtual void insertRow(const Row< DBCell >& new_row) final;
634 
635  /// insert a new DBRow of DBCells at the end of the database
636  /** The new_row passed in argument is supposed to come from an external
637  * database. So it must contain data for the ignored columns.
638  * @throw SizeError is raised if the vector of string cannot be inserted
639  * in the DatabaseTable because its size does not allow a matching with
640  * the columns of the DatabaseTable (taking into account the ignored
641  * columns) */
642  virtual void insertRow(Row< DBCell >&& new_row) final;
643 
644  /// insert a set of new DBRows at the end of the database
645  /** Unlike methods insertRows for data whose type is different from
646  * DBTranslatedValue, this method assumes that the new rows passed in
647  * argument do not contain any data of the ignored columns. So, basically,
648  * these rows could be copied as is into the database table.
649  * @throw SizeError is raised if the size of at least one row in new_rows
650  * is not equal to the number of translators in the DatabaseTable
651  * @throw InvalidArgument is raised if at least one element of new_rows does
652  * not belong to the domain of its corresponding translator.
653  */
654  virtual void
655  insertRows(Matrix< DBTranslatedValue >&& new_rows,
656  const DBVector< IsMissing >& rows_have_missing_vals) final;
657 
658  /// insert a set of new DBRows at the end of the database
659  /** Unlike methods insertRows for data whose type is different from
660  * DBTranslatedValue, this method assumes that the new rows passed in
661  * argument do not contain any data of the ignored columns. So, basically,
662  * these rows could be copied as is into the database table.
663  * @throw SizeError is raised if the size of at least one row in new_rows
664  * is not equal to the number of translators in the DatabaseTable
665  * @throw InvalidArgument is raised if at least one element of new_rows does
666  * not belong to the domain of its corresponding translator.*/
667  virtual void
668  insertRows(const Matrix< DBTranslatedValue >& new_rows,
669  const DBVector< IsMissing >& rows_have_missing_vals) final;
670 
671  /// insert a set of new DBRows at the end of the database
672  /** The new rows passed in argument are supposed to come from an external
673  * database. So they must contain data for the ignored columns.
674  * @throw SizeError is raised if a row of DBCells cannot be inserted
675  * in the DatabaseTable because its size does not allow a matching with
676  * the columns of the DatabaseTable (taking into account the ignored
677  * columns) */
678  virtual void insertRows(Matrix< DBCell >&& new_rows) final;
679 
680  /// insert a set of new DBRows at the end of the database
681  /** The new rows passed in argument are supposed to come from an external
682  * database. So they must contain data for the ignored columns.
683  * @throw SizeError is raised if a row of DBCells cannot be inserted
684  * in the DatabaseTable because its size does not allow a matching with
685  * the columns of the DatabaseTable (taking into account the ignored
686  * columns) */
687  virtual void insertRows(const Matrix< DBCell >& new_rows) final;
688 
689  /// erase the content of the database, including the names of the variables
690  virtual void clear() final;
691 
692  // substitutes the kth translator by another one
693  /* The method checks that:
694  * 1/ it is possible to get back the original values of the database
695  * for the rows already translated.
696  * 2/ the new translator is capable of translating these values.
697  *
698  * If both checks pass, then it replaces the kth translator
699  * by the one passed in arguments and retranslates with it the kth
700  * cell of all the rows already contained in the database */
701 
702  /// @}
703 
704 
705 #ifndef DOXYGEN_SHOULD_SKIP_THIS
706 
707  private:
708  /// the set of translators used to convert the strings into floats
709  DBTranslatorSet< ALLOC > translators__;
710 
711  /// the set of ignored columns asked by the user
712  Set< std::size_t, ALLOC< std::size_t > > ignored_cols__;
713 
714  /** @brief check that a row's values are compatible with those of the
715  * translators' variables */
716  bool isRowCompatible__(const Row< DBTranslatedValue >& row) const;
717 
718  /** @brief returns the index corresponding either to the kth translator or
719  * to the first one that parses the kth column of the input dataset
720  *
721  * @warning if the translator does not exist, the function returns an
722  * index which is greater than the number of translators */
723  std::size_t getKthIndex__(const std::size_t k,
724  const bool k_is_input_col) const;
725 
726  /** @brief returns the indices corresponding either to the kth translator
727  * or to all those that parse the kth column of the input dataset
728  *
729  * @warning the indices are sorted in decreasing order */
730  DBVector< std::size_t > getKthIndices__(const std::size_t k,
731  const bool k_is_input_col) const;
732 
733  /// a method to process the rows of the database in multithreading
734  /** The function tries to execute function/functor exec_func using one
735  * or several threads. If an exception is raised by at least one thread,
736  * then function undo_func is executed to undo what exec_func
737  * did, and the exception is rethrown.
738  *
739  * @param exec_func this should be a function/functor/lambda that
740  * takes 2 arguments: the first one is an std::size_t containing the
741  * index of the first row that it should process, the second argument is
742  * an std::size_t equal to 1 + the index of the last row processed (so
743  * the processing is performed on [first,last)). The return type of exec_func
744  * is void. If a thread executing exec_func raises an exception, then
745  * before exiting, it should undo what it did.
746  * @param undo_func a function/functor/lambda with the same
747  * prototype as exec_func. If a thread raises an exception, those that
748  * did not raise exceptions should undo what they did in order to restore
749  * the state that the database had before the execution of the thread. After
750  * calling undo_func, they should have restored this state.
751  */
752  template < typename Functor1, typename Functor2 >
753  void threadProcessDatabase__(Functor1& exec_func, Functor2& undo_func);
754 
755 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
756  };
757 
758  } /* namespace learning */
759 
760 } /* namespace gum */
761 
762 /// always include the templated implementations
763 #include <agrum/tools/database/databaseTable_tpl.h>
764 
765 #endif /* GUM_DATABASE_TABLE_H */
virtual ~DatabaseTable()
destructor
virtual void insertRow(const Row< DBTranslatedValue > &new_row, const IsMissing contains_missing_data) final
insert a new row at the end of the database
virtual void insertRows(Matrix< DBCell > &&new_rows) final
insert a set of new DBRows at the end of the database
DatabaseTable(const DatabaseTable< ALLOC > &from)
copy constructor
virtual void ignoreColumn(const std::size_t k, const bool from_external_object=true) final
makes the database table ignore from now on the kth column of the input dataset or the column parsed ...
DBVector< std::size_t > domainSizes() const
returns the domain sizes of all the variables in the database table
const DBTranslatorSet< ALLOC > & translatorSet() const
returns the set of translators
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
virtual void insertRows(const Matrix< DBTranslatedValue > &new_rows, const DBVector< IsMissing > &rows_have_missing_vals) final
insert a set of new DBRows at the end of the database
virtual void insertRow(const std::vector< std::string, ALLOC< std::string > > &new_row) final
insert a new row at the end of the database
DatabaseTable(DatabaseTable< ALLOC > &&from)
move constructor
DatabaseTable(const DatabaseTable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
bool needsReordering(const std::size_t k, const bool k_is_input_col=false) const
indicates whether a reordering is needed to sort the translations of the kth translator or those of t...
std::size_t insertTranslator(const Variable &var, const std::size_t input_column, const bool unique_column=true)
insert a new translator into the database table
virtual DatabaseTable< ALLOC > * clone() const final
virtual copy constructor
virtual void insertRow(const Row< DBCell > &new_row) final
insert a new DBRow of DBCells at the end of the database
virtual const DBVector< std::size_t > inputColumns() const final
returns the set of columns of the original dataset that are present in the DatabaseTable ...
const DBTranslator< ALLOC > & translator(const std::size_t k, const bool k_is_input_col=false) const
returns either the kth translator of the database table or the first one reading the kth column of th...
virtual void insertRow(Row< DBTranslatedValue > &&new_row, const IsMissing contains_missing_data) final
insert a new DBRow at the end of the database
std::size_t domainSize(const std::size_t k, const bool k_is_input_col=false) const
returns the domain size of the kth variable of the database table or of that of the first one corresp...
virtual const DBVector< std::size_t > ignoredColumns() const final
returns the set of columns of the original dataset that are ignored
DatabaseTable< ALLOC > & operator=(const DatabaseTable< ALLOC > &from)
copy operator
DatabaseTable(DatabaseTable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual void insertRow(Row< DBCell > &&new_row) final
insert a new DBRow of DBCells at the end of the database
const Variable & variable(const std::size_t k, const bool k_is_input_col=false) const
returns either the kth variable of the database table or the first one corresponding to the kth colum...
virtual void insertRows(Matrix< DBTranslatedValue > &&new_rows, const DBVector< IsMissing > &rows_have_missing_vals) final
insert a set of new DBRows at the end of the database
std::size_t insertTranslator(const Variable &var, const std::size_t input_column, std::vector< std::string, XALLOC< std::string > > missing_symbols, const bool unique_column=true)
insert a new translator into the database table
The class representing a tabular database as used by learning tasks.
virtual void clear() final
erase the content of the database, including the names of the variables
virtual void setVariableNames(const std::vector< std::string, ALLOC< std::string > > &names, const bool from_external_object=true) final
sets the names of the variables
DatabaseTable< ALLOC > & operator=(DatabaseTable< ALLOC > &&from)
move operator
virtual void insertRows(const Matrix< DBCell > &new_rows) final
insert a set of new DBRows at the end of the database
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
DatabaseTable(const MissingValType< XALLOC > &missing_symbols, const DBTranslatorSet< ALLOC > &translators=DBTranslatorSet< ALLOC >(), const allocator_type &alloc=allocator_type())
default constructor
void reorder(const std::size_t k, const bool k_is_input_col=false)
performs a reordering of the kth translator or of the first translator parsing the kth column of the ...
void eraseTranslators(const std::size_t k, const bool k_is_input_col=false)
erases either the kth translator or all those parsing the kth column of the input dataset ...
virtual DatabaseTable< ALLOC > * clone(const allocator_type &alloc) const final
virtual copy constructor with a given allocator
void reorder()
performs a reordering of all the columns