aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
DBTranslatorSet.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief A class for storing several translators
24  *
25  * When learning Bayesian networks, the records of the train dataset are
26  * used to construct contingency tables that are either exploited in
27  * statistical conditional independence tests or in scores. In both cases,
28  * the values observed in the records must be translated into indices in
29  * the finite domain of the corresponding random variables. DBTranslator
30  * instances are used for this purpose. To make the parsing of all the columns
31  * of the dataset easier, all the DBTranslator instances used are gathered
32  * into a single DBTranslatorSet.
33  *
34  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
35  */
36 #ifndef GUM_LEARNING_DB_TRANSLATOR_SET_H
37 #define GUM_LEARNING_DB_TRANSLATOR_SET_H
38 
39 #include <vector>
40 #include <limits>
41 #include <type_traits>
42 
43 #include <agrum/agrum.h>
44 #include <agrum/tools/database/DBTranslator.h>
45 #include <agrum/tools/database/DBTranslator4LabelizedVariable.h>
46 #include <agrum/tools/database/DBTranslator4DiscretizedVariable.h>
47 #include <agrum/tools/database/DBTranslator4RangeVariable.h>
48 #include <agrum/tools/database/DBTranslator4ContinuousVariable.h>
49 
50 
51 namespace gum {
52 
53  namespace learning {
54 
55  /** @class DBTranslatorSet
56  * @headerfile DBTranslatorSet.h <agrum/tools/database/DBTranslatorSet.h>
57  * @ingroup learning_database
58  * @brief the class for packing together the translators used to preprocess
59  * the datasets
60  *
61  * When learning Bayesian networks, the records of the train dataset are
62  * used to construct contingency tables that are either exploited in
63  * statistical conditional independence tests or in scores. In both cases,
64  * the values observed in the records must be translated into indices in
65  * the finite domain of the corresponding random variables. The
66  * DBTranslator classes are used for this purpose. To make the parsing
67  * of all the columns of the dataset easier, all the DBTranslator
68  * instances used are gathered into a DBTranslatorSet.
69  *
70  * @par Here is an example of how to use this class:
71  * @code
72  * // create an empty translator set
73  * gum::learning::DBTranslatorSet<> set;
74  *
75  * std::vector<std::string> missing { "?", "N/A", "???" };
76  *
77  * // create the translators and add them to the translator set. First,
78  * // create translator1 that will perform its translations on Column 1
79  * // of the dataset (columns start from index 0)
80  * gum::learning::DBTranslator4LabelizedVariable<> translator1 ( missing, 3 );
81  * std::size_t pos1 = set.insertTranslator ( translator1, 1 );
82  * // currently, pos1 is equal to 0, that is, translator1 is the first
83  * // translator in the translator set
84  *
85  * // create a translator handling Column 0 of the dataset
86  * gum::learning::DBTranslator4ContinuousVariable<> translator0;
87  * std::size_t pos0 = set.insertTranslator ( translator0, 0 );
88  * // translator0 has been inserted into the translator set at position pos0.
89  * // pos0 = 0 because translators are sorted by increasing column order in
90  * // the translator set. So, now, in the set, the first translator is
91  * translator0 and the second one is translator1.
92  *
93  * gum::learning::DBTranslator4LabelizedVariable<> translator2;
94  * std::size_t pos2 = set.insertTranslator ( translator2, 2 );
95  * // the set contains { translator0, translator1, translator2 }, in this order
96  *
97  * // parsing the rows of the dataset
98  * std::vector<std::string> row1 { ".33", "toto", "titi" };
99  * float val0 = set.translate ( row1, 0 ).cont_val; // val0 = 0.33f
100  * std::size_t val1 = set.translate ( row1, 1 ).discr_val; // val1 = 0 (toto)
101  * std::size_t val2 = set.translate ( row1, 2 ).discr_val; // val2 = 0 (titi)
102  * std::vector<std::string> row2 { "4.22x", "???", "??" };
103  * val0 = set.translate ( row2, 0 ).cont_val; // raises gum::TypeError
104  * val1 = set.translate ( row2, 1 ).discr_val;
105  * // = std::numeric_limits<std::size_t>::max ()
106  * val2 = set.translate ( row2, 2 ).discr_val; // = 1 (??)
107  *
108  * // with method translateSafe, an exception is raised whenever we try to
109  * // translate a column that is not taken into account by the translators
110  * set.translateSafe ( row2, 3 ); // raises gum::UndefinedElement
111  * @endcode
112  */
113  template < template < typename > class ALLOC = std::allocator >
114  class DBTranslatorSet {
115  public:
116  /// type for the allocators passed in arguments of methods
117  using allocator_type = ALLOC< DBTranslator< ALLOC > >;
118 
119  // ##########################################################################
120  /// @name Constructors / Destructors
121  // ##########################################################################
122 
123  /// @{
124 
125  /// default constructor
126  DBTranslatorSet(const allocator_type& alloc = allocator_type());
127 
128  /// copy constructor
129  DBTranslatorSet(const DBTranslatorSet< ALLOC >& from);
130 
131  /// copy constructor with a given allocator
132  DBTranslatorSet(const DBTranslatorSet< ALLOC >& from,
133  const allocator_type& alloc);
134 
135  /// move constructor
136  DBTranslatorSet(DBTranslatorSet< ALLOC >&& from);
137 
138  /// move constructor with a given allocator
139  DBTranslatorSet(DBTranslatorSet< ALLOC >&& from,
140  const allocator_type& alloc);
141 
142  /// virtual copy constructor
143  virtual DBTranslatorSet< ALLOC >* clone() const;
144 
145  /// virtual copy constructor with a given allocator
146  virtual DBTranslatorSet< ALLOC >* clone(const allocator_type& alloc) const;
147 
148  /// destructor
149  virtual ~DBTranslatorSet();
150 
151  /// @}
152 
153  // ##########################################################################
154  /// @name Operators
155  // ##########################################################################
156 
157  /// @{
158 
159  /// copy operator
160  DBTranslatorSet< ALLOC >& operator=(const DBTranslatorSet< ALLOC >& from);
161 
162  /// move operator
163  DBTranslatorSet< ALLOC >& operator=(DBTranslatorSet< ALLOC >&& from);
164 
165  /// returns the kth translator
166  /** @warning this operator assumes that there are at least k translators.
167  * So, it won't check that the kth translator actually exists. If unsure,
168  * use method translatorSafe that performs this check. */
169  DBTranslator< ALLOC >& operator[](const std::size_t k);
170 
171  /// returns the kth translator
172  /** @warning this operator assumes that there are at least k translators.
173  * So, it won't check that the kth translator actually exists. If unsure,
174  * use method translatorSafe that performs this check. */
175  const DBTranslator< ALLOC >& operator[](const std::size_t k) const;
176 
177  /// @}
178 
179 
180  // ##########################################################################
181  /// @name Accessors / Modifiers
182  // ##########################################################################
183 
184  /// @{
185 
186  /// inserts a new translator at the end of the translator set
187  /** @param translator a translator that will be copied into the
188  * translator set
189  * @param column the index of the column that this new translator should
190  * read in the database.
191  * @param unique_column indicates whether the column can be read by
192  * several translators.
193  * @return the position of the translator within the translator set.
194  * @throw DuplicateElement is raised if there already exists a translator
195  * reading the column passed in argument and the unique_column
196  * argument is set to true. */
197  template < template < template < typename > class > class Translator >
198  std::size_t insertTranslator(const Translator< ALLOC >& translator,
199  const std::size_t column,
200  const bool unique_column = true);
201 
202  /** @brief inserts a new translator for a given variable at the end of
203  * the translator set
204  *
205  * @param var the variable that will be contained into the translator
206  * @param column the index of the column that this new translator should
207  * read in the database.
208  * @param missing_symbols the set of symbols in the database
209  * representing missing values
210  * @param unique_column indicates whether the column can be read by
211  * several translators.
212  * @throw DuplicateElement is raised if there already exists a translator
213  * reading the column passed in argument and the unique_column
214  * argument is set to true.
215  */
216  template < template < typename > class XALLOC >
217  std::size_t insertTranslator(
218  const Variable& var,
219  const std::size_t column,
220  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
221  const bool unique_column = true);
222 
223  /** @brief inserts a new translator for a given variable at the end of
224  * the translator set
225  *
226  * @param var the variable that will be contained into the translator
227  * @param column the index of the column that this new translator should
228  * read in the database.
229  * @param unique_column indicates whether the column can be read by
230  * several translators.
231  * @throw DuplicateElement is raised if there already exists a translator
232  * reading the column passed in argument and the unique_column
233  * argument is set to true.
234  */
235  std::size_t insertTranslator(const Variable& var,
236  const std::size_t column,
237  const bool unique_column = true);
238 
239  /** @brief erases either the kth translator or those parsing the kth
240  * column of the input database
241  *
242  * DBTranslatorSets do not necessarily read all the columns of their
243  * input database. For instance, a CSV may contain 10 columns, but the
244  * DBTranslatorSet may only contain two translators reading columns 3
245  * and 5 respectively. When k_is_input_col is set to false, Parameter k
246  * passed in argument corresponds to either 0 or 1, i.e., to the index of
247  * one of the two translators stored into the DBTranslatorSet. When
248  * k_is_input_col is set to true, the translators to be erased are the ones
249  * that parse the kth column of the input database (when several
250  * translators parse the column k, all of them are removed).
251  * @warning if the translator does not exist, nothing is done. In
252  * particular, no exception is raised. */
253  void eraseTranslator(const std::size_t k, const bool k_is_input_col = false);
254 
255  /// returns the kth translator
256  /** @warning this method assumes that there are at least k translators.
257  * So, it won't check that the kth translator actually exists. If unsure,
258  * use method translatorSafe that performs this check.*/
259  DBTranslator< ALLOC >& translator(const std::size_t k);
260 
261  /// returns the kth translator
262  /** @warning this method assumes that there are at least k translators.
263  * So, it won't check that the kth translator actually exists. If unsure,
264  * use method translatorSafe that performs this check.*/
265  const DBTranslator< ALLOC >& translator(const std::size_t k) const;
266 
267  /// returns the kth translator
268  /** @throw UndefinedElement is raised if there are fewer than k
269  * translators in the translator set. */
270  DBTranslator< ALLOC >& translatorSafe(const std::size_t k);
271 
272  /// returns the kth translator
273  /** @throw UndefinedElement is raised if there are fewer than k
274  * translators in the translator set. */
275  const DBTranslator< ALLOC >& translatorSafe(const std::size_t k) const;
276 
277  /// ask the kth translator to translate a string in a row of the database
278  /** @param row a row of the original database
279  * @param k the index of the translator that will perform the translation
280  * @warning this method assumes that there are at least k translators.
281  * So, it won't check that the kth translator actually exists. If unsure,
282  * use method translateSafe that performs this check.
283  * @warning as there is not necessarily an identity mapping between the
284  * set of columns of the database and the set of translators used, k may
285  * not necessarily correspond to the index of a column in the database:
286  * this is the index of a translator within the set */
287  template < template < typename > class OTHER_ALLOC >
288  DBTranslatedValue translate(
289  const std::vector< std::string, OTHER_ALLOC< std::string > >& row,
290  const std::size_t k) const;
291 
292  /** @brief similar to method translate, except that it checks that the kth
293  * translator exists
294  *
295  * @param row a row of the original database
296  * @param k the index of the translator that will perform the translation
297  * @throw UndefinedElement is raised if there are fewer than k
298  * translators in the translator set.
299  * @warning as there is not necessarily an identity mapping between the
300  * set of columns of the database and the set of translators used, k may
301  * not necessarily correspond to the index of a column in the database:
302  * this is the index of a translator within the set */
303  template < template < typename > class OTHER_ALLOC >
304  DBTranslatedValue translateSafe(
305  const std::vector< std::string, OTHER_ALLOC< std::string > >& row,
306  const std::size_t k) const;
307 
308  /// returns the original string that was translated into translated_val
309  /** @param translated_val the value from which we look for the
310  * original string
311  * @param k the index of the translator that performed the translation
312  * @warning this method assumes that there are at least k translators.
313  * So, it won't check that the kth translator actually exists. If unsure,
314  * use method translateBackSafe that performs this check.
315  * @warning as there is not necessarily an identity mapping between the
316  * set of columns of the database and the set of translators used, k may
317  * not necessarily correspond to the index of a column in the database:
318  * this is the index of a translator within the set */
319  std::string translateBack(const DBTranslatedValue translated_val,
320  const std::size_t k) const;
321 
322  /** @brief similar to method translateBack, except that it checks that
323  * the kth translator exists
324  *
325  * @param translated_val the value from which we look for the
326  * original string
327  * @param k the index of the translator that performed the translation
328  * @throw UndefinedElement is raised if there are fewer than k
329  * translators in the translator set.
330  * @warning as there is not necessarily an identity mapping between the
331  * set of columns of the database and the set of translators used, k may
332  * not necessarily correspond to the index of a column in the database:
333  * this is the index of a translator within the set */
334  std::string translateBackSafe(const DBTranslatedValue translated_val,
335  const std::size_t k) const;
336 
337  /** @brief indicates whether the kth translator considers a translated_val
338  * as a missing value
339  *
340  * @param translated_val the value that we compare to the translation of
341  * a missing value
342  * @param k the index of the translator that performed the translation
343  * @warning this method assumes that there are at least k translators.
344  * So, it won't check that the kth translator actually exists. If unsure,
345  * use method isMissingValueSafe that performs this check.
346  */
347  bool isMissingValue(const DBTranslatedValue translated_val,
348  const std::size_t k) const;
349 
350  /** @brief similar to method isMissingValue, except that it checks that
351  * the kth translator exists
352  *
353  * @param translated_val the value that we compare to the translation of
354  * a missing value
355  * @param k the index of the translator that performed the translation
356  * @throw UndefinedElement is raised if there are fewer than k
357  * translators in the translator set.
358  */
359  bool isMissingValueSafe(const DBTranslatedValue translated_val,
360  const std::size_t k) const;
361 
362  /// returns the domain size of the variable stored into the kth translator
363  /** @warning this method assumes that there are at least k translators.
364  * So, it won't check that the kth translator actually exists. If unsure,
365  * use method domainSizeSafe that performs this check. */
366  std::size_t domainSize(const std::size_t k) const;
367 
368  /// returns the domain size of the variable stored into the kth translator
369  /** @throw UndefinedElement is raised if there are fewer than k
370  * translators in the translator set. */
371  std::size_t domainSizeSafe(const std::size_t k) const;
372 
373  /// returns the variable stored into the kth translator
374  /** @warning this method assumes that there are at least k translators.
375  * So, it won't check that the kth translator actually exists. If unsure,
376  * use method variableSafe that performs this check. */
377  const Variable& variable(const std::size_t k) const;
378 
379  /// returns the variable stored into the kth translator
380  /** @throw UndefinedElement is raised if there are fewer than k
381  * translators in the translator set. */
382  const Variable& variableSafe(const std::size_t k) const;
383 
384  /** @brief indicates whether a reordering is needed to make the kth
385  * translator sorted
386  *
387  * For a given translator, if the strings represented by the translations
388  * are only numbers, the translations are considered to be sorted if and
389  * only if they are sorted by increasing number. If the strings do not
390  * only represent numbers, then translations are considered to be sorted
391  * if and only if they are sorted lexicographically.
392  *
393  * When constructing dynamically its dictionary, the translator may
394  * assign wrong DBTranslatedValue values to strings. For instance, a
395  * translator reading sequentially integer strings 4, 1, 3, may map
396  * 4 into DBTranslatedValue{std::size_t(0)},
397  * 1 into DBTranslatedValue{std::size_t(1)} and
398  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
399  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
400  * a domain specified with increasing values. This requires a
401  * reordering. Method needsReordering() returns a Boolean indicating
402  * whether such a reordering should be performed or whether the current
403  * order is OK.
404  * @warning this method assumes that there are at least k translators.
405  * So, it won't check that the kth translator actually exists. If unsure,
406  * use method needsReorderingSafe that performs this check. */
407  bool needsReordering(const std::size_t k) const;
408 
409  /// same as method needsReordering but checks that the kth translator exists
410  /** @throw UndefinedElement is raised if there are fewer than k
411  * translators in the translator set. */
412  bool needsReorderingSafe(const std::size_t k) const;
413 
414  /** @brief performs a reordering of the dictionary and returns a mapping
415  * from the old translated values to the new ones.
416  *
417  * When a reordering is needed, i.e., string values must be translated
418  * differently, Method reorder() computes how the translations should be
419  * changed. It updates accordingly the dictionary and returns the mapping
420  * that enables changing the old dictionary values into the new ones.
421  * Note that the hash table returned is expressed in terms of std::size_t
422  * because only the translations for discrete random variables need be
423  * reordered, those for continuous random variables are identity mappings.
424  * @warning this method assumes that there are at least k translators.
425  * So, it won't check that the kth translator actually exists. If unsure,
426  * use method reorderSafe that performs this check. */
427  HashTable< std::size_t,
428  std::size_t,
429  ALLOC< std::pair< std::size_t, std::size_t > > >
430  reorder(const std::size_t k);
431 
432  /// same as method reorder but checks that the kth translator exists
433  /** @throw UndefinedElement is raised if there are fewer than k
434  * translators in the translator set. */
435  HashTable< std::size_t,
436  std::size_t,
437  ALLOC< std::pair< std::size_t, std::size_t > > >
438  reorderSafe(const std::size_t k);
439 
440  /** @brief returns the column of the input database that will be read
441  * by the kth translator
442  *
443  * @warning this method assumes that there are at least k translators.
444  * So, it won't check that the kth translator actually exists. If unsure,
445  * use method inputColumnSafe that performs this check. */
446  std::size_t inputColumn(const std::size_t k) const;
447 
448  /** @brief returns the column of the input database that will be read
449  * by the kth translator
450  * @throw UndefinedElement is raised if there are fewer than k
451  * translators in the translator set. */
452  std::size_t inputColumnSafe(const std::size_t k) const;
453 
454  /// returns the largest input database column index read by the translators
455  std::size_t highestInputColumn() const;
456 
457  /// remove all the translators
458  void clear();
459 
460  /// returns the number of translators stored into the set
461  std::size_t nbTranslators() const;
462 
463  /// returns the number of translators stored into the set
464  std::size_t size() const;
465 
466  /// returns the allocator used by the translator set
467  allocator_type getAllocator() const;
468 
469  /// returns the set of translators
470  const std::vector< DBTranslator< ALLOC >*, ALLOC< DBTranslator< ALLOC >* > >&
471  translators() const;
472 
473  /// @}
474 
475 #ifndef DOXYGEN_SHOULD_SKIP_THIS
476 
477  private:
478  // the set of all the translators
479  std::vector< DBTranslator< ALLOC >*, ALLOC< DBTranslator< ALLOC >* > >
480  translators__;
481 
482  // a vector indicating which column of the original database each
483  // translator should translate
484  std::vector< std::size_t, ALLOC< std::size_t > > columns__;
485 
486  // the highest column index read by the translators
487  std::size_t highest_column__{std::size_t(0)};
488 
489  /// copy the content of another translator set that uses another allocator
490  void copy__(const DBTranslatorSet< ALLOC >& from,
491  const allocator_type& alloc);
492 
493 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
494  };
495 
496  } /* namespace learning */
497 
498 } /* namespace gum */
499 
500 // always include templated implementation
501 #include <agrum/tools/database/DBTranslatorSet_tpl.h>
502 
503 #endif /* GUM_LEARNING_DB_TRANSLATOR_SET_H */
HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder(const std::size_t k)
performs a reordering of the dictionary and returns a mapping from the old translated values to the n...
const Variable & variable(const std::size_t k) const
returns the variable stored into the kth translator
DBTranslatorSet(const DBTranslatorSet< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
const std::vector< DBTranslator< ALLOC > *, ALLOC< DBTranslator< ALLOC > *> > & translators() const
returns the set of translators
void eraseTranslator(const std::size_t k, const bool k_is_input_col=false)
erases either the kth translator or those parsing the kth column of the input database ...
DBTranslatorSet(const DBTranslatorSet< ALLOC > &from)
copy constructor
DBTranslatorSet< ALLOC > & operator=(const DBTranslatorSet< ALLOC > &from)
copy operator
std::string translateBack(const DBTranslatedValue translated_val, const std::size_t k) const
returns the original string that was translated into translated_val
const Variable & variableSafe(const std::size_t k) const
returns the variable stored into the kth translator
std::size_t domainSizeSafe(const std::size_t k) const
returns the domain size of the variable stored into the kth translator
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
std::size_t highestInputColumn() const
returns the largest input database column index read by the translators
DBTranslator< ALLOC > & translatorSafe(const std::size_t k)
returns the kth translator
std::size_t insertTranslator(const Variable &var, const std::size_t column, const bool unique_column=true)
inserts a new translator for a given variable at the end of the translator set
const DBTranslator< ALLOC > & operator[](const std::size_t k) const
returns the kth translator
std::size_t inputColumnSafe(const std::size_t k) const
returns the column of the input database that will be read by the kth translator
DBTranslatorSet(DBTranslatorSet< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
std::size_t nbTranslators() const
returns the number of translators stored into the set
DBTranslator< ALLOC > & operator[](const std::size_t k)
returns the kth translator
std::size_t domainSize(const std::size_t k) const
returns the domain size of the variable stored into the kth translator
std::size_t insertTranslator(const Variable &var, const std::size_t column, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, const bool unique_column=true)
inserts a new translator for a given variable at the end of the translator set
DBTranslator< ALLOC > & translator(const std::size_t k)
returns the kth translator
bool isMissingValueSafe(const DBTranslatedValue translated_val, const std::size_t k) const
similar to method isMissingValue, except that it checks that the kth translator exists ...
allocator_type getAllocator() const
returns the allocator used by the translator set
std::size_t insertTranslator(const Translator< ALLOC > &translator, const std::size_t column, const bool unique_column=true)
inserts a new translator at the end of the translator set
bool needsReordering(const std::size_t k) const
indicates whether a reordering is needed to make the kth translator sorted
virtual ~DBTranslatorSet()
destructor
std::size_t inputColumn(const std::size_t k) const
returns the column of the input database that will be read by the kth translator
const DBTranslator< ALLOC > & translator(const std::size_t k) const
returns the kth translator
std::string translateBackSafe(const DBTranslatedValue translated_val, const std::size_t k) const
similar to method translateBack, except that it checks that the kth translator exists ...
const DBTranslator< ALLOC > & translatorSafe(const std::size_t k) const
returns the kth translator
std::size_t size() const
returns the number of translators stored into the set
DBTranslatorSet< ALLOC > & operator=(DBTranslatorSet< ALLOC > &&from)
move operator
DBTranslatedValue translateSafe(const std::vector< std::string, OTHER_ALLOC< std::string > > &row, const std::size_t k) const
similar to method translate, except that it checks that the kth translator exists ...
bool needsReorderingSafe(const std::size_t k) const
same as method needsReordering but checks that the kth translator exists
DBTranslatedValue translate(const std::vector< std::string, OTHER_ALLOC< std::string > > &row, const std::size_t k) const
ask the kth translator to translate a string in a row of the database
void clear()
remove all the translators
HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorderSafe(const std::size_t k)
same as method reorder but checks that the kth translator exists
virtual DBTranslatorSet< ALLOC > * clone() const
virtual copy constructor
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
virtual DBTranslatorSet< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
bool isMissingValue(const DBTranslatedValue translated_val, const std::size_t k) const
indicates whether the kth translator considers a translated_val as a missing value ...
DBTranslatorSet(DBTranslatorSet< ALLOC > &&from)
move constructor