aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
DBTranslator.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The base class for all the tabular databases' cell translators
24  *
25  * This file contains the basis for interacting with a tabular database as
26  * a translator. Every translator should derive from this class.
27  *
28  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
29  */
30 #ifndef GUM_LEARNING_DB_TRANSLATOR_H
31 #define GUM_LEARNING_DB_TRANSLATOR_H
32 
33 #include <string>
34 #include <limits>
35 #include <vector>
36 #include <utility>
37 
38 #include <agrum/agrum.h>
39 #include <agrum/tools/core/bijection.h>
40 #include <agrum/tools/core/set.h>
41 #include <agrum/tools/variables/variable.h>
42 #include <agrum/tools/database/DBTranslatedValue.h>
43 
44 
45 namespace gum {
46 
47  namespace learning {
48 
49 
50  /** @class DBTranslator
51  * @headerfile DBTranslator.h <agrum/tools/database/DBTranslator.h>
52  * @brief The base class for all the tabular database cell translators
53  *
54  * Translators are used by DatabaseTable instances to transform datasets'
55  * strings into DBTranslatedValue instances. The point is that strings are
56  * not adequate for fast learning, they need to be preprocessed into a type
57  * that can be analyzed quickly (the so-called DBTranslatedValue type).
58  * The DBTranslator class is the abstract base class for all the translators
59  * used in aGrUM.
60  *
61  * Here is an example of how to use it, illustrated with the
62  * DBTranslator4ContinuousVariable class:
63  *
64  * @code
65  * // create the translator, with possible missing symbols: "N/A" and "???"
66  * // i.e., each time the translator reads a "N/A" or a "???" string, it
67  * // won't translate it into a number but into a missing value.
68  * std::vector<std::string> missing { "N/A", "???" };
69  * gum::learning::DBTranslator4ContinuousVariable<> translator ( missing );
70  *
71  * // gets the DBTranslatedValue corresponding to some strings
72  * auto val1 = translator.translate("5"); // val1 = DBTranslatedValue {5.0f}
73  * auto val2 = translator.translate("4.2"); // val2 = DBTranslatedValue {4.2f}
74  * auto val3 = translator << "3.4"; // val3 = DBTranslatedValue {3.4f}
75  *
76  * // add the numbers assigned to val1, val2, val3
77  * float sum = val1.cont_val + val2.cont_val + val3.cont_val;
78  *
79  * // translate missing values: val4 and val5 will be equal to:
80  * // DBTranslatedValue { std::numeric_limits<float>::max () }
81  * auto val4 = translator << "N/A";
82  * auto val5 = translator.translate ( "???" );
83  *
84  * // the following instructions raise TypeError exceptions because the
85  * // strings cannot be translated into real numbers
86  * auto val6 = translator << "4.22x";
87  * auto val7 = translator.translate ( "xxx" );
88  *
89  * // given a DBTranslatedValue that is supposed to contain a float, get
90  * // the corresponding string. The strings should be equivalent to those
91  * // indicated below (maybe they could contain more zeroes after the dot).
92  * std::string str;
93  * str = translator.translateBack ( val1 ); // str ~ "5.0"
94  * str = translator >> val2; // str ~ "4.2"
95  * str = translator >> gum::learning::DBTranslatedValue {7.2e3f};
96  * // str ~ "7.2 e3"
97  *
98  * // translate back missing values: the string will correspond to one of
99  * // the missing symbols known to the translator
100  * str = translator >> val4; // str = "N/A" or "???"
101  * str = translator >> val5; // str = "N/A" or "???"
102  *
103  * // get the domain size of the variable stored into the translator
104  * // This size is only useful for translators with discrete variables
105  * std::size_t size = translator.domainSize ();
106  *
107  * // get the variable stored within the translator
108  * const gum::ContinuousVariable<float>* var =
109  * dynamic_cast<const gum::ContinuousVariable<float>*>
110  * ( translator.variable () );
111  *@endcode
112  *
113  * @ingroup learning_database
114  */
115  template < template < typename > class ALLOC = std::allocator >
117  public:
118  /// type for the allocators passed in arguments of methods
120 
121  // ##########################################################################
122  /// @name Constructors / Destructors
123  // ##########################################################################
124 
125  /// @{
126 
127  /// default constructor
128  /** @param val_type indicates whether the DBTranslator deals with discrete
129  * or continuous variables
130  * @param editable_dictionary indicates whether the dictionary used for
131  * translations can be updated dynamically when observing new strings or
132  * whether it should remain constant. To see how this parameter is handled,
133  * see the child classes inheriting from DBTranslator
134  * @param missing_symbols the set of symbols in the database
135  * representing missing values
136  * @param max_dico_entries the max number of entries that the dictionary
137  * can contain. If we try to add entries beyond this limit, this will
138  * be considered as an error and a SizeError exception will be raised
139  * @param alloc The allocator used to allocate memory for all the
140  * fields of the DBTranslator
141  */
142  template < template < typename > class XALLOC >
143  DBTranslator(
144  DBTranslatedValueType val_type,
145  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
146  const bool editable_dictionary = true,
147  std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(),
148  const allocator_type& alloc = allocator_type());
149 
150  /// default constructor without missing symbols
151  /** @param val_type indicates whether the DBTranslator deals with discrete
152  * or continuous variables
153  * @param editable_dictionary indicates whether the dictionary used for
154  * translations can be updated dynamically when observing new strings or
155  * whether it should remain constant. To see how this parameter is handled,
156  * see the child classes inheriting from DBTranslator
157  * @param max_dico_entries the max number of entries that the dictionary
158  * can contain. If we try to add entries beyond this limit, this will
159  * be considered as an error and a SizeError exception will be raised
160  * @param alloc The allocator used to allocate memory for all the
161  * fields of the DBTranslator
162  */
163  DBTranslator(DBTranslatedValueType val_type,
164  const bool editable_dictionary = true,
165  std::size_t max_dico_entries
166  = std::numeric_limits< std::size_t >::max(),
167  const allocator_type& alloc = allocator_type());
168 
169  /// copy constructor
170  DBTranslator(const DBTranslator< ALLOC >& from);
171 
172  /// copy constructor with a given allocator
173  DBTranslator(const DBTranslator< ALLOC >& from, const allocator_type& alloc);
174 
175  /// move constructor
176  DBTranslator(DBTranslator< ALLOC >&& from);
177 
178  /// move constructor with a given allocator
179  DBTranslator(DBTranslator< ALLOC >&& from, const allocator_type& alloc);
180 
181  /// virtual copy constructor
182  virtual DBTranslator< ALLOC >* clone() const = 0;
183 
184  /// virtual copy constructor with a given allocator
185  virtual DBTranslator< ALLOC >* clone(const allocator_type& alloc) const = 0;
186 
187  /// destructor
188  virtual ~DBTranslator();
189 
190  /// @}
191 
192 
193  // ##########################################################################
194  /// @name Operators
195  // ##########################################################################
196 
197  /// @{
198 
199  /// alias for method translate
201 
202  /// alias for method translateBack
203  std::string operator>>(const DBTranslatedValue translated_val);
204 
205  /// @}
206 
207 
208  // ##########################################################################
209  /// @name Accessors / Modifiers
210  // ##########################################################################
211 
212  /// @{
213 
214  /// returns the translation of a string
215  /** This method tries to translate a given string into the
216  * DBTranslatedValue that should be stored into a DatabaseTable. If the
217  * translator cannot find the translation in its current dictionary, then
218  * two situations can
219  * occur:
220  * -# if the translator is not in an editable dictionary mode, then the
221  * translator raises an UnknownLabelInDatabase exception.
222  * -# if the translator is in an editable dictionary mode, i.e., it is
223  * allowed to update its dictionary, then it tries to add the string
224  * as a new value in the dictionary. Upon success, it returns the
225  * translated value, otherwise, it raises either:
226  * - a SizeError exception if the number of entries in the dictionary
227  * has already reached its maximum,
228  * - a TypeError exception if the string cannot be converted into a
229  * value that can be inserted into the dictionary
230  * - an OperationNotAllowed exception if the translation would induce
231  * incoherent behavior (e.g., a DBTranslator4ContinuousVariable that
232  * contains a variable whose domain is [x,y] as well as a missing
233  * value symbol z \f$\in\f$ [x,y]).
234  *
235  * @warning Note that missing values (i.e., string encoded as missing
236  * symbols) are translated as std::numeric_limits<>::max ().
237  * @param str the string that the DBTranslator will try to translate
238  * @return the translated value of the string to be stored into a
239  * DatabaseTable
240  * @throws UnknownLabelInDatabase is raised if the translation cannot
241  * be found and the translator is not in an editable dictionary mode.
242  * @throws SizeError is raised if the number of entries in the dictionary
243  * has already reached its maximum.
244  * @throws OperationNotAllowed exception is raised if the translation
245  * cannot be found and the insertion of the string into the translator's
246  * dictionary fails because it would induce incoherent behavior (e.g.,
247  * a DBTranslator4ContinuousVariable that contains a variable whose domain
248  * is [x,y] as well as a missing value symbol z \f$\in\f$ [x,y]).
249  * @throws TypeError is raised if the translation cannot be found and
250  * the insertion of the string into the translator's dictionary fails
251  * due to str being impossible to be converted into an appropriate type. */
252  virtual DBTranslatedValue translate(const std::string& str) = 0;
253 
254  /// returns the original value for a given translation
255  /** @param translated_val a value that should result from a translation
256  * and for which we are looking for the corresponding DBTranslator's
257  * variable's label (a string)
258  * @return the string that was translated into a given DBTranslatedValue.
259  * @warning when the translator is not a proper bijection, like, e.g.,
260  * DBTranslator4DiscretizedVariable, the method returns the value of
261  * the random variable corresponding to translated_val (i.e., for a
262  * discretized variable, it would return the interval corresponding to
263  * translated_val).
264  * @throws UnknownLabelInDatabase is raised if this original value
265  * cannot be found */
266  virtual std::string
267  translateBack(const DBTranslatedValue translated_val) const = 0;
268 
269  /// returns the domain size of a variable corresponding to the translations
270  /** Assume that the translator has been fed with the observed values of
271  * a random variable. Then it has produced a set of translated values. The
272  * latter define the domain of the variable. When the variable is discrete,
273  * values are assumed to span from 0 to a number n-1. In this case, the
274  * domain size of the variable is n. When the variable is continuous,
275  * the domain size should be infinite and we return a
276  * std::numeric_limits<std::size_t>::max() to represent it. Note that
277  * missing values are encoded as std::numeric_limits<>::max () and are
278  * not taken into account in the domain sizes. */
279  virtual std::size_t domainSize() const = 0;
280 
281  /// indicates whether the translator has an editable dictionary or not
282  virtual bool hasEditableDictionary() const;
283 
284  /// sets/unset the editable dictionary mode
285  virtual void setEditableDictionaryMode(bool new_mode);
286 
287  /** @brief indicates whether a reordering is needed to make the
288  * translations sorted
289  *
290  * If the strings represented by the translations are only numbers,
291  * translations are considered to be sorted if and only if they are sorted
292  * by increasing number. If the strings do not only represent numbers, then
293  * translations are considered to be sorted if and only if they are sorted
294  * lexicographically.
295  *
296  * When dynamically constructing its dictionary, the translator may
297  * assign wrong DBTranslatedValue values to strings. For instance, a
298  * translator reading sequentially integer strings 4, 1, 3, may map
299  * 4 into DBTranslatedValue{std::size_t(0)},
300  * 1 into DBTranslatedValue{std::size_t(1)} and
301  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
302  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
303  * a domain specified with increasing values. This requires a
304  * reordering. Method needsReordering() returns a Boolean indicating
305  * whether such a reordering should be performed or whether the current
306  * order is OK. */
307  virtual bool needsReordering() const = 0;
308 
309  /** @brief performs a reordering of the dictionary and returns a mapping
310  * from the old translated values to the new ones.
311  *
312  * When a reordering is needed, i.e., string values must be translated
313  * differently, Method reorder() computes how the translations should be
314  * changed. It updates accordingly the dictionary and returns the mapping
315  * that enables changing the old dictionary values into the new ones.
316  * Note that the hash table returned is expressed in terms of std::size_t
317  * because only the translations for discrete random variables need be
318  * reordered, those for continuous random variables are identity mappings.
319  * @warning If there is no reordering to perform, the method returns
320  * an empty hashtable. */
321  virtual HashTable< std::size_t,
322  std::size_t,
323  ALLOC< std::pair< std::size_t, std::size_t > > >
324  reorder() = 0;
325 
326  /// returns the set of missing symbols taken into account by the translator
327  const Set< std::string, ALLOC< std::string > >& missingSymbols() const;
328 
329  /// indicates whether a string corresponds to a missing symbol
330  bool isMissingSymbol(const std::string& str) const;
331 
332  /// returns the variable stored into the translator
333  virtual const Variable* variable() const = 0;
334 
335  /// sets the name of the variable stored into the translator
336  void setVariableName(const std::string& str) const;
337 
338  /// sets the description of the variable stored into the translator
339  void setVariableDescription(const std::string& str) const;
340 
341  /// returns the type of values handled by the translator
342  /** @returns either DBTranslatedValueType::DISCRETE if the translator
343  * includes a discrete variable or DBTranslatedValueType::CONTINUOUS if
344  * it contains a continuous variable. This is convenient to know how to
345  * interpret the DBTranslatedValue instances produced by the DBTranslator:
346  * either using their discr_val field or their cont_val field. */
348 
349  /// returns the allocator used by the translator
351 
352  /// indicates whether a translated value corresponds to a missing value
353  bool isMissingValue(const DBTranslatedValue& val) const;
354 
355  /// returns the translation of a missing value
356  virtual DBTranslatedValue missingValue() const = 0;
357 
358  /// @}
359 
360 
361  protected:
362  // ##########################################################################
363  /// @name Protected Operators
364  // ##########################################################################
365 
366  /// @{
367 
368  /// copy operator
370 
371  /// move operator
373 
374  /// @}
375 
376 
377  /// indicates whether the dictionary can be updated or not
379 
380  /// the maximum number of entries that the dictionary is allowed to contain
381  std::size_t max_dico_entries_;
382 
383  /// the set of missing symbols
385 
386  /// the bijection relating back translated values and their original strings.
387  /** Note that the translated values considered here are of type std::size_t
388  * because only the values for discrete variables need be stored, those
389  * for continuous variables are actually identity mappings.
390  * @warning only the values of the random variable are stored into this
391  * bijection. Missing values are not considered here. */
392  mutable Bijection< std::size_t,
393  std::string,
394  ALLOC< std::pair< float, std::string > > >
396 
397  /// the type of the values translated by the translator
399  };
400 
401 
402  } /* namespace learning */
403 
404 } /* namespace gum */
405 
406 // always include the template implementation
407 #include <agrum/tools/database/DBTranslator_tpl.h>
408 
409 #endif /* GUM_LEARNING_DB_TRANSLATOR_H */
virtual DBTranslator< ALLOC > * clone(const allocator_type &alloc) const =0
virtual copy constructor with a given allocator
DBTranslatedValue operator<<(const std::string &str)
alias for method translate
virtual DBTranslatedValue missingValue() const =0
returns the translation of a missing value
virtual DBTranslatedValue translate(const std::string &str)=0
returns the translation of a string
virtual std::string translateBack(const DBTranslatedValue translated_val) const =0
returns the original value for a given translation
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
std::size_t max_dico_entries_
the maximum number of entries that the dictionary is allowed to contain
Definition: DBTranslator.h:381
void setVariableName(const std::string &str) const
sets the name of the variable stored into the translator
virtual const Variable * variable() const =0
returns the variable stored into the translator
DBTranslator(DBTranslatedValueType val_type, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, const bool editable_dictionary=true, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor
virtual DBTranslator< ALLOC > * clone() const =0
virtual copy constructor
virtual ~DBTranslator()
destructor
virtual bool needsReordering() const =0
indicates whether a reordering is needed to make the translations sorted
const Set< std::string, ALLOC< std::string > > & missingSymbols() const
returns the set of missing symbols taken into account by the translator
DBTranslator(const DBTranslator< ALLOC > &from)
copy constructor
allocator_type getAllocator() const
returns the allocator used by the translator
bool isMissingValue(const DBTranslatedValue &val) const
indicates whether a translated value corresponds to a missing value
DBTranslator(DBTranslator< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
The base class for all the tabular database cell translators.
Definition: DBTranslator.h:116
DBTranslator< ALLOC > & operator=(const DBTranslator< ALLOC > &from)
copy operator
virtual void setEditableDictionaryMode(bool new_mode)
sets/unset the editable dictionary mode
bool is_dictionary_dynamic_
indicates whether the dictionary can be updated or not
Definition: DBTranslator.h:378
DBTranslatedValueType val_type_
the type of the values translated by the translator
Definition: DBTranslator.h:398
DBTranslator(const DBTranslator< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
virtual std::size_t domainSize() const =0
returns the domain size of a variable corresponding to the translations
DBTranslator(DBTranslator< ALLOC > &&from)
move constructor
DBTranslator(DBTranslatedValueType val_type, const bool editable_dictionary=true, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor without missing symbols
Bijection< std::size_t, std::string, ALLOC< std::pair< float, std::string > > > back_dico_
the bijection relating back translated values and their original strings.
Definition: DBTranslator.h:395
std::string operator>>(const DBTranslatedValue translated_val)
alias for method translateBack
virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder()=0
performs a reordering of the dictionary and returns a mapping from the old translated values to the n...
virtual bool hasEditableDictionary() const
indicates whether the translator has an editable dictionary or not
DBTranslator< ALLOC > & operator=(DBTranslator< ALLOC > &&from)
move operator
bool isMissingSymbol(const std::string &str) const
indicates whether a string corresponds to a missing symbol
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
void setVariableDescription(const std::string &str) const
sets the description of the variable stored into the translator
DBTranslatedValueType getValType() const
returns the type of values handled by the translator
Set< std::string, ALLOC< std::string > > missing_symbols_
the set of missing symbols
Definition: DBTranslator.h:384