aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBTranslatorSet.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief A class for storing several translators
24  *
25  * When learning Bayesian networks, the records of the train dataset are
26  * used to construct contingency tables that are either exploited in
27  * statistical conditional independence tests or in scores. In both cases,
28  * the values observed in the records must be translated into indices in
29  * the finite domain of the corresponding random variables. DBTranslator
30  * instances are used for this purpose. To make the parsing of all the columns
31  * of the dataset easier, all the DBTranslator instances used are gathered
32  * into a single DBTranslatorSet.
33  *
34  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
35  */
36 #ifndef GUM_LEARNING_DB_TRANSLATOR_SET_H
37 #define GUM_LEARNING_DB_TRANSLATOR_SET_H
38 
39 #include <vector>
40 #include <limits>
41 #include <type_traits>
42 
43 #include <agrum/agrum.h>
44 #include <agrum/tools/database/DBTranslator.h>
45 #include <agrum/tools/database/DBTranslator4LabelizedVariable.h>
46 #include <agrum/tools/database/DBTranslator4DiscretizedVariable.h>
47 #include <agrum/tools/database/DBTranslator4RangeVariable.h>
48 #include <agrum/tools/database/DBTranslator4ContinuousVariable.h>
49 
50 
51 namespace gum {
52 
53  namespace learning {
54 
55  /** @class DBTranslatorSet
56  * @headerfile DBTranslatorSet.h <agrum/tools/database/DBTranslatorSet.h>
57  * @ingroup learning_database
58  * @brief the class for packing together the translators used to preprocess
59  * the datasets
60  *
61  * When learning Bayesian networks, the records of the train dataset are
62  * used to construct contingency tables that are either exploited in
63  * statistical conditional independence tests or in scores. In both cases,
64  * the values observed in the records must be translated into indices in
65  * the finite domain of the corresponding random variables. The
66  * DBTranslator classes are used for this purpose. To make the parsing
67  * of all the columns of the dataset easier, all the DBTranslator
68  * instances used are gathered into a DBTranslatorSet.
69  *
70  * @par Here is an example of how to use this class:
71  * @code
72  * // create an empty translator set
73  * gum::learning::DBTranslatorSet<> set;
74  *
75  * std::vector<std::string> missing { "?", "N/A", "???" };
76  *
77  * // create the translators and add them to the translator set. First,
78  * // create translator1 that will perform its translations on Column 1
79  * // of the dataset (columns start from index 0)
80  * gum::learning::DBTranslator4LabelizedVariable<> translator1 ( missing, 3 );
81  * std::size_t pos1 = set.insertTranslator ( translator1, 1 );
82  * // currently, pos1 is equal to 0, that is, translator1 is the first
83  * // translator in the translator set
84  *
85  * // create a translator handling Column 0 of the dataset
86  * gum::learning::DBTranslator4ContinuousVariable<> translator0;
87  * std::size_t pos0 = set.insertTranslator ( translator0, 0 );
88  * // translator0 has been inserted into the translator set at position pos0.
89  * // pos0 = 0 because translators are sorted by increasing column order in
90  * // the translator set. So, now, in the set, the first translator is
91  * translator0 and the second one is translator1.
92  *
93  * gum::learning::DBTranslator4LabelizedVariable<> translator2;
94  * std::size_t pos2 = set.insertTranslator ( translator2, 2 );
95  * // the set contains { translator0, translator1, translator2 }, in this order
96  *
97  * // parsing the rows of the dataset
98  * std::vector<std::string> row1 { ".33", "toto", "titi" };
99  * float val0 = set.translate ( row1, 0 ).cont_val; // val0 = 0.33f
100  * std::size_t val1 = set.translate ( row1, 1 ).discr_val; // val1 = 0 (toto)
101  * std::size_t val2 = set.translate ( row1, 2 ).discr_val; // val2 = 0 (titi)
102  * std::vector<std::string> row2 { "4.22x", "???", "??" };
103  * val0 = set.translate ( row2, 0 ).cont_val; // raises gum::TypeError
104  * val1 = set.translate ( row2, 1 ).discr_val;
105  * // = std::numeric_limits<std::size_t>::max ()
106  * val2 = set.translate ( row2, 2 ).discr_val; // = 1 (??)
107  *
108  * // with method translateSafe, an exception is raised whenever we try to
109  * // translate a column that is not taken into account by the translators
110  * set.translateSafe ( row2, 3 ); // raises gum::UndefinedElement
111  * @endcode
112  */
113  template < template < typename > class ALLOC = std::allocator >
115  public:
116  /// type for the allocators passed in arguments of methods
118 
119  // ##########################################################################
120  /// @name Constructors / Destructors
121  // ##########################################################################
122 
123  /// @{
124 
125  /// default constructor
126  DBTranslatorSet(const allocator_type& alloc = allocator_type());
127 
128  /// copy constructor
129  DBTranslatorSet(const DBTranslatorSet< ALLOC >& from);
130 
131  /// copy constructor with a given allocator
132  DBTranslatorSet(const DBTranslatorSet< ALLOC >& from, const allocator_type& alloc);
133 
134  /// move constructor
135  DBTranslatorSet(DBTranslatorSet< ALLOC >&& from);
136 
137  /// move constructor with a given allocator
138  DBTranslatorSet(DBTranslatorSet< ALLOC >&& from, const allocator_type& alloc);
139 
140  /// virtual copy constructor
141  virtual DBTranslatorSet< ALLOC >* clone() const;
142 
143  /// virtual copy constructor with a given allocator
144  virtual DBTranslatorSet< ALLOC >* clone(const allocator_type& alloc) const;
145 
146  /// destructor
147  virtual ~DBTranslatorSet();
148 
149  /// @}
150 
151  // ##########################################################################
152  /// @name Operators
153  // ##########################################################################
154 
155  /// @{
156 
157  /// copy operator
159 
160  /// move operator
162 
163  /// returns the kth translator
164  /** @warning this operator assumes that there are at least k translators.
165  * So, it won't check that the kth translator actually exists. If unsure,
166  * use method translatorSafe that performs this check. */
167  DBTranslator< ALLOC >& operator[](const std::size_t k);
168 
169  /// returns the kth translator
170  /** @warning this operator assumes that there are at least k translators.
171  * So, it won't check that the kth translator actually exists. If unsure,
172  * use method translatorSafe that performs this check. */
173  const DBTranslator< ALLOC >& operator[](const std::size_t k) const;
174 
175  /// @}
176 
177 
178  // ##########################################################################
179  /// @name Accessors / Modifiers
180  // ##########################################################################
181 
182  /// @{
183 
184  /// inserts a new translator at the end of the translator set
185  /** @param translator a translator that will be copied into the
186  * translator set
187  * @param column the index of the column that this new translator should
188  * read in the database.
189  * @param unique_column indicates whether the column can be read by
190  * several translators.
191  * @return the position of the translator within the translator set.
192  * @throw DuplicateElement is raised if there already exists a translator
193  * reading the column passed in argument and the unique_column
194  * argument is set to true. */
195  template < template < template < typename > class > class Translator >
196  std::size_t insertTranslator(const Translator< ALLOC >& translator,
197  const std::size_t column,
198  const bool unique_column = true);
199 
200  /** @brief inserts a new translator for a given variable at the end of
201  * the translator set
202  *
203  * @param var the variable that will be contained into the translator
204  * @param column the index of the column that this new translator should
205  * read in the database.
206  * @param missing_symbols the set of symbols in the database
207  * representing missing values
208  * @param unique_column indicates whether the column can be read by
209  * several translators.
210  * @throw DuplicateElement is raised if there already exists a translator
211  * reading the column passed in argument and the unique_column
212  * argument is set to true.
213  */
214  template < template < typename > class XALLOC >
215  std::size_t
216  insertTranslator(const Variable& var,
217  const std::size_t column,
218  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
219  const bool unique_column = true);
220 
221  /** @brief inserts a new translator for a given variable at the end of
222  * the translator set
223  *
224  * @param var the variable that will be contained into the translator
225  * @param column the index of the column that this new translator should
226  * read in the database.
227  * @param unique_column indicates whether the column can be read by
228  * several translators.
229  * @throw DuplicateElement is raised if there already exists a translator
230  * reading the column passed in argument and the unique_column
231  * argument is set to true.
232  */
233  std::size_t insertTranslator(const Variable& var,
234  const std::size_t column,
235  const bool unique_column = true);
236 
237  /** @brief erases either the kth translator or those parsing the kth
238  * column of the input database
239  *
240  * DBTranslatorSets do not necessarily read all the columns of their
241  * input database. For instance, a CSV may contain 10 columns, but the
242  * DBTranslatorSet may only contain two translators reading columns 3
243  * and 5 respectively. When k_is_input_col is set to false, Parameter k
244  * passed in argument corresponds to either 0 or 1, i.e., to the index of
245  * one of the two translators stored into the DBTranslatorSet. When
246  * k_is_input_col is set to true, the translators to be erased are the ones
247  * that parse the kth column of the input database (when several
248  * translators parse the column k, all of them are removed).
249  * @warning if the translator does not exist, nothing is done. In
250  * particular, no exception is raised. */
251  void eraseTranslator(const std::size_t k, const bool k_is_input_col = false);
252 
253  /// returns the kth translator
254  /** @warning this method assumes that there are at least k translators.
255  * So, it won't check that the kth translator actually exists. If unsure,
256  * use method translatorSafe that performs this check.*/
257  DBTranslator< ALLOC >& translator(const std::size_t k);
258 
259  /// returns the kth translator
260  /** @warning this method assumes that there are at least k translators.
261  * So, it won't check that the kth translator actually exists. If unsure,
262  * use method translatorSafe that performs this check.*/
263  const DBTranslator< ALLOC >& translator(const std::size_t k) const;
264 
265  /// returns the kth translator
266  /** @throw UndefinedElement is raised if there are fewer than k
267  * translators in the translator set. */
269 
270  /// returns the kth translator
271  /** @throw UndefinedElement is raised if there are fewer than k
272  * translators in the translator set. */
273  const DBTranslator< ALLOC >& translatorSafe(const std::size_t k) const;
274 
275  /// ask the kth translator to translate a string in a row of the database
276  /** @param row a row of the original database
277  * @param k the index of the translator that will perform the translation
278  * @warning this method assumes that there are at least k translators.
279  * So, it won't check that the kth translator actually exists. If unsure,
280  * use method translateSafe that performs this check.
281  * @warning as there is not necessarily an identity mapping between the
282  * set of columns of the database and the set of translators used, k may
283  * not necessarily correspond to the index of a column in the database:
284  * this is the index of a translator within the set */
285  template < template < typename > class OTHER_ALLOC >
287  const std::size_t k) const;
288 
289  /** @brief similar to method translate, except that it checks that the kth
290  * translator exists
291  *
292  * @param row a row of the original database
293  * @param k the index of the translator that will perform the translation
294  * @throw UndefinedElement is raised if there are fewer than k
295  * translators in the translator set.
296  * @warning as there is not necessarily an identity mapping between the
297  * set of columns of the database and the set of translators used, k may
298  * not necessarily correspond to the index of a column in the database:
299  * this is the index of a translator within the set */
300  template < template < typename > class OTHER_ALLOC >
303  const std::size_t k) const;
304 
305  /// returns the original string that was translated into translated_val
306  /** @param translated_val the value from which we look for the
307  * original string
308  * @param k the index of the translator that performed the translation
309  * @warning this method assumes that there are at least k translators.
310  * So, it won't check that the kth translator actually exists. If unsure,
311  * use method translateBackSafe that performs this check.
312  * @warning as there is not necessarily an identity mapping between the
313  * set of columns of the database and the set of translators used, k may
314  * not necessarily correspond to the index of a column in the database:
315  * this is the index of a translator within the set */
317 
318  /** @brief similar to method translateBack, except that it checks that
319  * the kth translator exists
320  *
321  * @param translated_val the value from which we look for the
322  * original string
323  * @param k the index of the translator that performed the translation
324  * @throw UndefinedElement is raised if there are fewer than k
325  * translators in the translator set.
326  * @warning as there is not necessarily an identity mapping between the
327  * set of columns of the database and the set of translators used, k may
328  * not necessarily correspond to the index of a column in the database:
329  * this is the index of a translator within the set */
331  const std::size_t k) const;
332 
333  /** @brief indicates whether the kth translator considers a translated_val
334  * as a missing value
335  *
336  * @param translated_val the value that we compare to the translation of
337  * a missing value
338  * @param k the index of the translator that performed the translation
339  * @warning this method assumes that there are at least k translators.
340  * So, it won't check that the kth translator actually exists. If unsure,
341  * use method isMissingValueSafe that performs this check.
342  */
343  bool isMissingValue(const DBTranslatedValue translated_val, const std::size_t k) const;
344 
345  /** @brief similar to method isMissingValue, except that it checks that
346  * the kth translator exists
347  *
348  * @param translated_val the value that we compare to the translation of
349  * a missing value
350  * @param k the index of the translator that performed the translation
351  * @throw UndefinedElement is raised if there are fewer than k
352  * translators in the translator set.
353  */
354  bool isMissingValueSafe(const DBTranslatedValue translated_val, const std::size_t k) const;
355 
356  /// returns the domain size of the variable stored into the kth translator
357  /** @warning this method assumes that there are at least k translators.
358  * So, it won't check that the kth translator actually exists. If unsure,
359  * use method domainSizeSafe that performs this check. */
360  std::size_t domainSize(const std::size_t k) const;
361 
362  /// returns the domain size of the variable stored into the kth translator
363  /** @throw UndefinedElement is raised if there are fewer than k
364  * translators in the translator set. */
365  std::size_t domainSizeSafe(const std::size_t k) const;
366 
367  /// returns the variable stored into the kth translator
368  /** @warning this method assumes that there are at least k translators.
369  * So, it won't check that the kth translator actually exists. If unsure,
370  * use method variableSafe that performs this check. */
371  const Variable& variable(const std::size_t k) const;
372 
373  /// returns the variable stored into the kth translator
374  /** @throw UndefinedElement is raised if there are fewer than k
375  * translators in the translator set. */
376  const Variable& variableSafe(const std::size_t k) const;
377 
378  /** @brief indicates whether a reordering is needed to make the kth
379  * translator sorted
380  *
381  * For a given translator, if the strings represented by the translations
382  * are only numbers, the translations are considered to be sorted if and
383  * only if they are sorted by increasing number. If the strings do not
384  * only represent numbers, then translations are considered to be sorted
385  * if and only if they are sorted lexicographically.
386  *
387  * When constructing dynamically its dictionary, the translator may
388  * assign wrong DBTranslatedValue values to strings. For instance, a
389  * translator reading sequentially integer strings 4, 1, 3, may map
390  * 4 into DBTranslatedValue{std::size_t(0)},
391  * 1 into DBTranslatedValue{std::size_t(1)} and
392  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
393  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
394  * a domain specified with increasing values. This requires a
395  * reordering. Method needsReordering() returns a Boolean indicating
396  * whether such a reordering should be performed or whether the current
397  * order is OK.
398  * @warning this method assumes that there are at least k translators.
399  * So, it won't check that the kth translator actually exists. If unsure,
400  * use method needsReorderingSafe that performs this check. */
401  bool needsReordering(const std::size_t k) const;
402 
403  /// same as method needsReordering but checks that the kth translator exists
404  /** @throw UndefinedElement is raised if there are fewer than k
405  * translators in the translator set. */
406  bool needsReorderingSafe(const std::size_t k) const;
407 
408  /** @brief performs a reordering of the dictionary and returns a mapping
409  * from the old translated values to the new ones.
410  *
411  * When a reordering is needed, i.e., string values must be translated
412  * differently, Method reorder() computes how the translations should be
413  * changed. It updates accordingly the dictionary and returns the mapping
414  * that enables changing the old dictionary values into the new ones.
415  * Note that the hash table returned is expressed in terms of std::size_t
416  * because only the translations for discrete random variables need be
417  * reordered, those for continuous random variables are identity mappings.
418  * @warning this method assumes that there are at least k translators.
419  * So, it won't check that the kth translator actually exists. If unsure,
420  * use method reorderSafe that performs this check. */
422  reorder(const std::size_t k);
423 
424  /// same as method reorder but checks that the kth translator exists
425  /** @throw UndefinedElement is raised if there are fewer than k
426  * translators in the translator set. */
428  reorderSafe(const std::size_t k);
429 
430  /** @brief returns the column of the input database that will be read
431  * by the kth translator
432  *
433  * @warning this method assumes that there are at least k translators.
434  * So, it won't check that the kth translator actually exists. If unsure,
435  * use method inputColumnSafe that performs this check. */
436  std::size_t inputColumn(const std::size_t k) const;
437 
438  /** @brief returns the column of the input database that will be read
439  * by the kth translator
440  * @throw UndefinedElement is raised if there are fewer than k
441  * translators in the translator set. */
442  std::size_t inputColumnSafe(const std::size_t k) const;
443 
444  /// returns the largest input database column index read by the translators
445  std::size_t highestInputColumn() const;
446 
447  /// remove all the translators
448  void clear();
449 
450  /// returns the number of translators stored into the set
451  std::size_t nbTranslators() const;
452 
453  /// returns the number of translators stored into the set
454  std::size_t size() const;
455 
456  /// returns the allocator used by the translator set
458 
459  /// returns the set of translators
460  const std::vector< DBTranslator< ALLOC >*, ALLOC< DBTranslator< ALLOC >* > >&
461  translators() const;
462 
463  /// @}
464 
465 #ifndef DOXYGEN_SHOULD_SKIP_THIS
466 
467  private:
468  // the set of all the translators
470 
471  // a vector indicating which column of the original database each
472  // translator should translate
473  std::vector< std::size_t, ALLOC< std::size_t > > _columns_;
474 
475  // the highest column index read by the translators
476  std::size_t _highest_column_{std::size_t(0)};
477 
478  /// copy the content of another translator set that uses another allocator
479  void _copy_(const DBTranslatorSet< ALLOC >& from, const allocator_type& alloc);
480 
481 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
482  };
483 
484  } /* namespace learning */
485 
486 } /* namespace gum */
487 
488 // always include templated implementation
489 #include <agrum/tools/database/DBTranslatorSet_tpl.h>
490 
491 #endif /* GUM_LEARNING_DB_TRANSLATOR_SET_H */
HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder(const std::size_t k)
performs a reordering of the dictionary and returns a mapping from the old translated values to the n...
const Variable & variable(const std::size_t k) const
returns the variable stored into the kth translator
DBTranslatorSet(const DBTranslatorSet< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
const std::vector< DBTranslator< ALLOC > *, ALLOC< DBTranslator< ALLOC > *> > & translators() const
returns the set of translators
void eraseTranslator(const std::size_t k, const bool k_is_input_col=false)
erases either the kth translator or those parsing the kth column of the input database ...
DBTranslatorSet(const DBTranslatorSet< ALLOC > &from)
copy constructor
DBTranslatorSet< ALLOC > & operator=(const DBTranslatorSet< ALLOC > &from)
copy operator
std::string translateBack(const DBTranslatedValue translated_val, const std::size_t k) const
returns the original string that was translated into translated_val
const Variable & variableSafe(const std::size_t k) const
returns the variable stored into the kth translator
std::size_t domainSizeSafe(const std::size_t k) const
returns the domain size of the variable stored into the kth translator
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
std::size_t highestInputColumn() const
returns the largest input database column index read by the translators
DBTranslator< ALLOC > & translatorSafe(const std::size_t k)
returns the kth translator
std::size_t insertTranslator(const Variable &var, const std::size_t column, const bool unique_column=true)
inserts a new translator for a given variable at the end of the translator set
const DBTranslator< ALLOC > & operator[](const std::size_t k) const
returns the kth translator
std::size_t inputColumnSafe(const std::size_t k) const
returns the column of the input database that will be read by the kth translator
DBTranslatorSet(DBTranslatorSet< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
std::size_t nbTranslators() const
returns the number of translators stored into the set
DBTranslator< ALLOC > & operator[](const std::size_t k)
returns the kth translator
std::size_t domainSize(const std::size_t k) const
returns the domain size of the variable stored into the kth translator
std::size_t insertTranslator(const Variable &var, const std::size_t column, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, const bool unique_column=true)
inserts a new translator for a given variable at the end of the translator set
DBTranslator< ALLOC > & translator(const std::size_t k)
returns the kth translator
bool isMissingValueSafe(const DBTranslatedValue translated_val, const std::size_t k) const
similar to method isMissingValue, except that it checks that the kth translator exists ...
allocator_type getAllocator() const
returns the allocator used by the translator set
std::size_t insertTranslator(const Translator< ALLOC > &translator, const std::size_t column, const bool unique_column=true)
inserts a new translator at the end of the translator set
bool needsReordering(const std::size_t k) const
indicates whether a reordering is needed to make the kth translator sorted
virtual ~DBTranslatorSet()
destructor
std::size_t inputColumn(const std::size_t k) const
returns the column of the input database that will be read by the kth translator
const DBTranslator< ALLOC > & translator(const std::size_t k) const
returns the kth translator
std::string translateBackSafe(const DBTranslatedValue translated_val, const std::size_t k) const
similar to method translateBack, except that it checks that the kth translator exists ...
const DBTranslator< ALLOC > & translatorSafe(const std::size_t k) const
returns the kth translator
std::size_t size() const
returns the number of translators stored into the set
DBTranslatorSet< ALLOC > & operator=(DBTranslatorSet< ALLOC > &&from)
move operator
DBTranslatedValue translateSafe(const std::vector< std::string, OTHER_ALLOC< std::string > > &row, const std::size_t k) const
similar to method translate, except that it checks that the kth translator exists ...
bool needsReorderingSafe(const std::size_t k) const
same as method needsReordering but checks that the kth translator exists
DBTranslatedValue translate(const std::vector< std::string, OTHER_ALLOC< std::string > > &row, const std::size_t k) const
ask the kth translator to translate a string in a row of the database
void clear()
remove all the translators
HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorderSafe(const std::size_t k)
same as method reorder but checks that the kth translator exists
virtual DBTranslatorSet< ALLOC > * clone() const
virtual copy constructor
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
virtual DBTranslatorSet< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
bool isMissingValue(const DBTranslatedValue translated_val, const std::size_t k) const
indicates whether the kth translator considers a translated_val as a missing value ...
DBTranslatorSet(DBTranslatorSet< ALLOC > &&from)
move constructor