aGrUM  0.21.0
a C++ library for (probabilistic) graphical models
DBTranslator.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The base class for all the tabular databases' cell translators
24  *
25  * This file contains the basis for interacting with a tabular database as
26  * a translator. Every translator should derive from this class.
27  *
28  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
29  */
30 #ifndef GUM_LEARNING_DB_TRANSLATOR_H
31 #define GUM_LEARNING_DB_TRANSLATOR_H
32 
33 #include <string>
34 #include <limits>
35 #include <vector>
36 #include <utility>
37 
38 #include <agrum/agrum.h>
39 #include <agrum/tools/core/bijection.h>
40 #include <agrum/tools/core/set.h>
41 #include <agrum/tools/variables/variable.h>
42 #include <agrum/tools/database/DBTranslatedValue.h>
43 
44 
45 namespace gum {
46 
47  namespace learning {
48 
49 
50  /** @class DBTranslator
51  * @headerfile DBTranslator.h <agrum/tools/database/DBTranslator.h>
52  * @brief The base class for all the tabular database cell translators
53  *
54  * Translators are used by DatabaseTable instances to transform datasets'
55  * strings into DBTranslatedValue instances. The point is that strings are
56  * not adequate for fast learning, they need to be preprocessed into a type
57  * that can be analyzed quickly (the so-called DBTranslatedValue type).
58  * The DBTranslator class is the abstract base class for all the translators
59  * used in aGrUM.
60  *
61  * Here is an example of how to use it, illustrated with the
62  * DBTranslator4ContinuousVariable class:
63  *
64  * @code
65  * // create the translator, with possible missing symbols: "N/A" and "???"
66  * // i.e., each time the translator reads a "N/A" or a "???" string, it
67  * // won't translate it into a number but into a missing value.
68  * std::vector<std::string> missing { "N/A", "???" };
69  * gum::learning::DBTranslator4ContinuousVariable<> translator ( missing );
70  *
71  * // gets the DBTranslatedValue corresponding to some strings
72  * auto val1 = translator.translate("5"); // val1 = DBTranslatedValue {5.0f}
73  * auto val2 = translator.translate("4.2"); // val2 = DBTranslatedValue {4.2f}
74  * auto val3 = translator << "3.4"; // val3 = DBTranslatedValue {3.4f}
75  *
76  * // add the numbers assigned to val1, val2, val3
77  * float sum = val1.cont_val + val2.cont_val + val3.cont_val;
78  *
79  * // translate missing values: val4 and val5 will be equal to:
80  * // DBTranslatedValue { std::numeric_limits<float>::max () }
81  * auto val4 = translator << "N/A";
82  * auto val5 = translator.translate ( "???" );
83  *
84  * // the following instructions raise TypeError exceptions because the
85  * // strings cannot be translated into real numbers
86  * auto val6 = translator << "4.22x";
87  * auto val7 = translator.translate ( "xxx" );
88  *
89  * // given a DBTranslatedValue that is supposed to contain a float, get
90  * // the corresponding string. The strings should be equivalent to those
91  * // indicated below (maybe they could contain more zeroes after the dot).
92  * std::string str;
93  * str = translator.translateBack ( val1 ); // str ~ "5.0"
94  * str = translator >> val2; // str ~ "4.2"
95  * str = translator >> gum::learning::DBTranslatedValue {7.2e3f};
96  * // str ~ "7.2 e3"
97  *
98  * // translate back missing values: the string will correspond to one of
99  * // the missing symbols known to the translator
100  * str = translator >> val4; // str = "N/A" or "???"
101  * str = translator >> val5; // str = "N/A" or "???"
102  *
103  * // get the domain size of the variable stored into the translator
104  * // This size is only useful for translators with discrete variables
105  * std::size_t size = translator.domainSize ();
106  *
107  * // get the variable stored within the translator
108  * const gum::ContinuousVariable<float>* var =
109  * dynamic_cast<const gum::ContinuousVariable<float>*>
110  * ( translator.variable () );
111  *@endcode
112  *
113  * @ingroup learning_database
114  */
115  template < template < typename > class ALLOC = std::allocator >
117  public:
118  /// type for the allocators passed in arguments of methods
120 
121  // ##########################################################################
122  /// @name Constructors / Destructors
123  // ##########################################################################
124 
125  /// @{
126 
127  /// default constructor
128  /** @param val_type indicates whether the DBTranslator deals with discrete
129  * or continuous variables
130  * @param editable_dictionary indicates whether the dictionary used for
131  * translations can be updated dynamically when observing new strings or
132  * whether it should remain constant. To see how this parameter is handled,
133  * see the child classes inheriting from DBTranslator
134  * @param missing_symbols the set of symbols in the database
135  * representing missing values
136  * @param max_dico_entries the max number of entries that the dictionary
137  * can contain. If we try to add entries beyond this limit, this will
138  * be considered as an error and a SizeError exception will be raised
139  * @param alloc The allocator used to allocate memory for all the
140  * fields of the DBTranslator
141  */
142  template < template < typename > class XALLOC >
143  DBTranslator(DBTranslatedValueType val_type,
144  const bool is_lossless,
145  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
146  const bool editable_dictionary = true,
147  std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(),
148  const allocator_type& alloc = allocator_type());
149 
150  /// default constructor without missing symbols
151  /** @param val_type indicates whether the DBTranslator deals with discrete
152  * or continuous variables
153  * @param editable_dictionary indicates whether the dictionary used for
154  * translations can be updated dynamically when observing new strings or
155  * whether it should remain constant. To see how this parameter is handled,
156  * see the child classes inheriting from DBTranslator
157  * @param max_dico_entries the max number of entries that the dictionary
158  * can contain. If we try to add entries beyond this limit, this will
159  * be considered as an error and a SizeError exception will be raised
160  * @param alloc The allocator used to allocate memory for all the
161  * fields of the DBTranslator
162  */
163  DBTranslator(DBTranslatedValueType val_type,
164  const bool is_lossless,
165  const bool editable_dictionary = true,
166  std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(),
167  const allocator_type& alloc = allocator_type());
168 
169  /// copy constructor
170  DBTranslator(const DBTranslator< ALLOC >& from);
171 
172  /// copy constructor with a given allocator
173  DBTranslator(const DBTranslator< ALLOC >& from, const allocator_type& alloc);
174 
175  /// move constructor
176  DBTranslator(DBTranslator< ALLOC >&& from);
177 
178  /// move constructor with a given allocator
179  DBTranslator(DBTranslator< ALLOC >&& from, const allocator_type& alloc);
180 
181  /// virtual copy constructor
182  virtual DBTranslator< ALLOC >* clone() const = 0;
183 
184  /// virtual copy constructor with a given allocator
185  virtual DBTranslator< ALLOC >* clone(const allocator_type& alloc) const = 0;
186 
187  /// destructor
188  virtual ~DBTranslator();
189 
190  /// @}
191 
192 
193  // ##########################################################################
194  /// @name Operators
195  // ##########################################################################
196 
197  /// @{
198 
199  /// alias for method translate
201 
202  /// alias for method translateBack
203  std::string operator>>(const DBTranslatedValue translated_val);
204 
205  /// @}
206 
207 
208  // ##########################################################################
209  /// @name Accessors / Modifiers
210  // ##########################################################################
211 
212  /// @{
213 
214  /// returns the translation of a string
215  /** This method tries to translate a given string into the
216  * DBTranslatedValue that should be stored into a DatabaseTable. If the
217  * translator cannot find the translation in its current dictionary, then
218  * two situations can
219  * arise:
220  * -# if the translator is not in an editable dictionary mode, then the
221  * translator raises an UnknownLabelInDatabase exception.
222  * -# if the translator is in an editable dictionary mode, i.e., it is
223  * allowed to update its dictionary, then it tries to add the string
224  * as a new value in the dictionary. Upon success, it returns the
225  * translated value, otherwise, it raises either:
226  * - a SizeError exception if the number of entries in the dictionary
227  * has already reached its maximum,
228  * - a TypeError exception if the string cannot be converted into a
229  * value that can be inserted into the dictionary
230  * - an OperationNotAllowed exception if the translation would induce
231  * incoherent behavior (e.g., a DBTranslator4ContinuousVariable that
232  * contains a variable whose domain is [x,y] as well as a missing
233  * value symbol z \f$\in\f$ [x,y]).
234  *
235  * @warning Note that missing values (i.e., string encoded as missing
236  * symbols) are translated as std::numeric_limits<>::max ().
237  * @param str the string that the DBTranslator will try to translate
238  * @return the translated value of the string to be stored into a
239  * DatabaseTable
240  * @throws UnknownLabelInDatabase is raised if the translation cannot
241  * be found and the translator is not in an editable dictionary mode.
242  * @throws SizeError is raised if the number of entries in the dictionary
243  * has already reached its maximum.
244  * @throws OperationNotAllowed exception is raised if the translation
245  * cannot be found and the insertion of the string into the translator's
246  * dictionary fails because it would induce incoherent behavior (e.g.,
247  * a DBTranslator4ContinuousVariable that contains a variable whose domain
248  * is [x,y] as well as a missing value symbol z \f$\in\f$ [x,y]).
249  * @throws TypeError is raised if the translation cannot be found and
250  * the insertion of the string into the translator's dictionary fails
251  * due to str being impossible to be converted into an appropriate type. */
252  virtual DBTranslatedValue translate(const std::string& str) = 0;
253 
254  /// returns the original value for a given translation
255  /** @param translated_val a value that should result from a translation
256  * and for which we are looking for the corresponding DBTranslator's
257  * variable's label (a string)
258  * @return the string that was translated into a given DBTranslatedValue.
259  * @warning when the translator is not a proper bijection, like, e.g.,
260  * DBTranslator4DiscretizedVariable, the method returns the value of
261  * the random variable corresponding to translated_val (i.e., for a
262  * discretized variable, it would return the interval corresponding to
263  * translated_val).
264  * @throws UnknownLabelInDatabase is raised if this original value
265  * cannot be found */
266  virtual std::string translateBack(const DBTranslatedValue translated_val) const = 0;
267 
268  /// returns the domain size of a variable corresponding to the translations
269  /** Assume that the translator has been fed with the observed values of
270  * a random variable. Then it has produced a set of translated values. The
271  * latter define the domain of the variable. When the variable is discrete,
272  * values are assumed to span from 0 to a number n-1. In this case, the
273  * domain size of the variable is n. When the variable is continuous,
274  * the domain size should be infinite and we return a
275  * std::numeric_limits<std::size_t>::max() to represent it. Note that
276  * missing values are encoded as std::numeric_limits<>::max () and are
277  * not taken into account in the domain sizes. */
278  virtual std::size_t domainSize() const = 0;
279 
280  /// indicates whether the translator has an editable dictionary or not
281  virtual bool hasEditableDictionary() const;
282 
283  /// sets/unset the editable dictionary mode
284  virtual void setEditableDictionaryMode(bool new_mode);
285 
286  /// returns the translation from database indices to input strings
287  virtual const Bijection< std::size_t, std::string,
288  ALLOC< std::pair< std::size_t, std::string > > >&
289  getDictionary () const;
290 
291  /** @brief indicates whether a reordering is needed to make the
292  * translations sorted
293  *
294  * If the strings represented by the translations are only numbers,
295  * translations are considered to be sorted if and only if they are sorted
296  * by increasing number. If the strings do not only represent numbers, then
297  * translations are considered to be sorted if and only if they are sorted
298  * lexicographically.
299  *
300  * When constructing dynamically its dictionary, the translator may
301  * assign wrong DBTranslatedValue values to strings. For instance, a
302  * translator reading sequentially integer strings 4, 1, 3, may map
303  * 4 into DBTranslatedValue{std::size_t(0)},
304  * 1 into DBTranslatedValue{std::size_t(1)} and
305  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
306  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
307  * a domain specified with increasing values. This requires a
308  * reordering. Method needsReordering() returns a Boolean indicating
309  * whether such a reordering should be performed or whether the current
310  * order is OK. */
311  virtual bool needsReordering() const = 0;
312 
313  /** @brief performs a reordering of the dictionary and returns a mapping
314  * from the old translated values to the new ones.
315  *
316  * When a reordering is needed, i.e., string values must be translated
317  * differently, Method reorder() computes how the translations should be
318  * changed. It updates accordingly the dictionary and returns the mapping
319  * that enables changing the old dictionary values into the new ones.
320  * Note that the hash table returned is expressed in terms of std::size_t
321  * because only the translations for discrete random variables need be
322  * reordered, those for continuous random variables are identity mappings.
323  * @warning If there is no reordering to perform, the method returns
324  * an empty hashtable. */
325  virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > >
326  reorder() = 0;
327 
328  /// returns the set of missing symbols taken into account by the translator
329  const Set< std::string, ALLOC< std::string > >& missingSymbols() const;
330 
331  /// indicates whether a string corresponds to a missing symbol
332  bool isMissingSymbol(const std::string& str) const;
333 
334  /// returns the variable stored into the translator
335  virtual const Variable* variable() const = 0;
336 
337  /// sets the name of the variable stored into the translator
338  void setVariableName(const std::string& str) const;
339 
340  /// sets the description of the variable stored into the translator
341  void setVariableDescription(const std::string& str) const;
342 
343  /// returns the type of values handled by the translator
344  /** @returns either DBTranslatedValueType::DISCRETE if the translator
345  * includes a discrete variable or DBTranslatedValueType::CONTINUOUS if
346  * it contains a continuous variable. This is convenient to know how to
347  * interpret the DBTranslatedValue instances produced by the DBTranslator:
348  * either using their discr_val field or their cont_val field. */
350 
351  /// returns a Boolean indicating whether the translation is lossless or not
352  /** Some translations can lose some information. For instance, a translator for a
353  * discretized variable will translate all the values of a discretization interval as
354  * the same value (the index of the interval). As such it loses some information
355  * because, knowing this index, it is impossible to get back to the original value that
356  * was translated. Method isLossless() indicates whether the translation never loses
357  * any information or not. */
358  bool isLossless() const;
359 
360  /// returns the allocator used by the translator
362 
363  /// indicates whether a translated value corresponds to a missing value
364  bool isMissingValue(const DBTranslatedValue& val) const;
365 
366  /// returns the translation of a missing value
367  virtual DBTranslatedValue missingValue() const = 0;
368 
369  /// @}
370 
371 
372  protected:
373  // ##########################################################################
374  /// @name Protected Operators
375  // ##########################################################################
376 
377  /// @{
378 
379  /// copy operator
381 
382  /// move operator
384 
385  /// @}
386 
387  /// indicates whether the translation is lossless (e.g., ranges) or not
389 
390  /// indicates whether the dictionary can be updated or not
392 
393  /// the maximum number of entries that the dictionary is allowed to contain
394  std::size_t max_dico_entries_;
395 
396  /// the set of missing symbols
398 
399  /// the bijection relating back translated values and their original strings.
400  /** Note that the translated values considered here are of type std::size_t
401  * because only the values for discrete variables need be stored, those
402  * for continuous variables are actually identity mappings.
403  * @warning only the values of the random variable are stored into this
404  * bijection. Missing values are not considered here. */
405  mutable Bijection< std::size_t, std::string, ALLOC< std::pair< std::size_t, std::string > > >
407 
408  /// the type of the values translated by the translator
410  };
411 
412 
413  } /* namespace learning */
414 
415 } /* namespace gum */
416 
417 // always include the template implementation
418 #include <agrum/tools/database/DBTranslator_tpl.h>
419 
420 #endif /* GUM_LEARNING_DB_TRANSLATOR_H */
bool isLossless() const
returns a Boolean indicating whether the translation is lossless or not
virtual DBTranslator< ALLOC > * clone(const allocator_type &alloc) const =0
virtual copy constructor with a given allocator
DBTranslatedValue operator<<(const std::string &str)
alias for method translate
virtual DBTranslatedValue missingValue() const =0
returns the translation of a missing value
virtual DBTranslatedValue translate(const std::string &str)=0
returns the translation of a string
virtual std::string translateBack(const DBTranslatedValue translated_val) const =0
returns the original value for a given translation
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
std::size_t max_dico_entries_
the maximum number of entries that the dictionary is allowed to contain
Definition: DBTranslator.h:394
void setVariableName(const std::string &str) const
sets the name of the variable stored into the translator
virtual const Variable * variable() const =0
returns the variable stored into the translator
bool is_lossless_
indicates whether the translation is lossless (e.g., ranges) or not
Definition: DBTranslator.h:388
virtual DBTranslator< ALLOC > * clone() const =0
virtual copy constructor
virtual ~DBTranslator()
destructor
virtual bool needsReordering() const =0
indicates whether a reordering is needed to make the translations sorted
const Set< std::string, ALLOC< std::string > > & missingSymbols() const
returns the set of missing symbols taken into account by the translator
DBTranslator(const DBTranslator< ALLOC > &from)
copy constructor
allocator_type getAllocator() const
returns the allocator used by the translator
bool isMissingValue(const DBTranslatedValue &val) const
indicates whether a translated value corresponds to a missing value
DBTranslator(DBTranslator< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
The base class for all the tabular database cell translators.
Definition: DBTranslator.h:116
DBTranslator< ALLOC > & operator=(const DBTranslator< ALLOC > &from)
copy operator
virtual void setEditableDictionaryMode(bool new_mode)
sets/unset the editable dictionary mode
bool is_dictionary_dynamic_
indicates whether the dictionary can be updated or not
Definition: DBTranslator.h:391
DBTranslatedValueType val_type_
the type of the values translated by the translator
Definition: DBTranslator.h:409
DBTranslator(const DBTranslator< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
virtual std::size_t domainSize() const =0
returns the domain size of a variable corresponding to the translations
DBTranslator(DBTranslator< ALLOC > &&from)
move constructor
std::string operator>>(const DBTranslatedValue translated_val)
alias for method translateBack
DBTranslator(DBTranslatedValueType val_type, const bool is_lossless, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, const bool editable_dictionary=true, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor
DBTranslator(DBTranslatedValueType val_type, const bool is_lossless, const bool editable_dictionary=true, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor without missing symbols
virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder()=0
performs a reordering of the dictionary and returns a mapping from the old translated values to the n...
virtual bool hasEditableDictionary() const
indicates whether the translator has an editable dictionary or not
DBTranslator< ALLOC > & operator=(DBTranslator< ALLOC > &&from)
move operator
bool isMissingSymbol(const std::string &str) const
indicates whether a string corresponds to a missing symbol
virtual const Bijection< std::size_t, std::string, ALLOC< std::pair< std::size_t, std::string > > > & getDictionary() const
returns the translation from database indices to input strings
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
void setVariableDescription(const std::string &str) const
sets the description of the variable stored into the translator
DBTranslatedValueType getValType() const
returns the type of values handled by the translator
Set< std::string, ALLOC< std::string > > missing_symbols_
the set of missing symbols
Definition: DBTranslator.h:397
Bijection< std::size_t, std::string, ALLOC< std::pair< std::size_t, std::string > > > back_dico_
the bijection relating back translated values and their original strings.
Definition: DBTranslator.h:406