aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBTranslator.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The base class for all the tabular databases' cell translators
24  *
25  * This file contains the basis for interacting with a tabular database as
26  * a translator. Every translator should derive from this class.
27  *
28  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
29  */
30 #ifndef GUM_LEARNING_DB_TRANSLATOR_H
31 #define GUM_LEARNING_DB_TRANSLATOR_H
32 
33 #include <string>
34 #include <limits>
35 #include <vector>
36 #include <utility>
37 
38 #include <agrum/agrum.h>
39 #include <agrum/tools/core/bijection.h>
40 #include <agrum/tools/core/set.h>
41 #include <agrum/tools/variables/variable.h>
42 #include <agrum/tools/database/DBTranslatedValue.h>
43 
44 
45 namespace gum {
46 
47  namespace learning {
48 
49 
50  /** @class DBTranslator
51  * @headerfile DBTranslator.h <agrum/tools/database/DBTranslator.h>
52  * @brief The base class for all the tabular database cell translators
53  *
54  * Translators are used by DatabaseTable instances to transform datasets'
55  * strings into DBTranslatedValue instances. The point is that strings are
56  * not adequate for fast learning, they need to be preprocessed into a type
57  * that can be analyzed quickly (the so-called DBTranslatedValue type).
58  * The DBTranslator class is the abstract base class for all the translators
59  * used in aGrUM.
60  *
61  * Here is an example of how to use it, illustrated with the
62  * DBTranslator4ContinuousVariable class:
63  *
64  * @code
65  * // create the translator, with possible missing symbols: "N/A" and "???"
66  * // i.e., each time the translator reads a "N/A" or a "???" string, it
67  * // won't translate it into a number but into a missing value.
68  * std::vector<std::string> missing { "N/A", "???" };
69  * gum::learning::DBTranslator4ContinuousVariable<> translator ( missing );
70  *
71  * // gets the DBTranslatedValue corresponding to some strings
72  * auto val1 = translator.translate("5"); // val1 = DBTranslatedValue {5.0f}
73  * auto val2 = translator.translate("4.2"); // val2 = DBTranslatedValue {4.2f}
74  * auto val3 = translator << "3.4"; // val3 = DBTranslatedValue {3.4f}
75  *
76  * // add the numbers assigned to val1, val2, val3
77  * float sum = val1.cont_val + val2.cont_val + val3.cont_val;
78  *
79  * // translate missing values: val4 and val5 will be equal to:
80  * // DBTranslatedValue { std::numeric_limits<float>::max () }
81  * auto val4 = translator << "N/A";
82  * auto val5 = translator.translate ( "???" );
83  *
84  * // the following instructions raise TypeError exceptions because the
85  * // strings cannot be translated into real numbers
86  * auto val6 = translator << "4.22x";
87  * auto val7 = translator.translate ( "xxx" );
88  *
89  * // given a DBTranslatedValue that is supposed to contain a float, get
90  * // the corresponding string. The strings should be equivalent to those
91  * // indicated below (maybe they could contain more zeroes after the dot).
92  * std::string str;
93  * str = translator.translateBack ( val1 ); // str ~ "5.0"
94  * str = translator >> val2; // str ~ "4.2"
95  * str = translator >> gum::learning::DBTranslatedValue {7.2e3f};
96  * // str ~ "7.2 e3"
97  *
98  * // translate back missing values: the string will correspond to one of
99  * // the missing symbols known to the translator
100  * str = translator >> val4; // str = "N/A" or "???"
101  * str = translator >> val5; // str = "N/A" or "???"
102  *
103  * // get the domain size of the variable stored into the translator
104  * // This size is only useful for translators with discrete variables
105  * std::size_t size = translator.domainSize ();
106  *
107  * // get the variable stored within the translator
108  * const gum::ContinuousVariable<float>* var =
109  * dynamic_cast<const gum::ContinuousVariable<float>*>
110  * ( translator.variable () );
111  *@endcode
112  *
113  * @ingroup learning_database
114  */
115  template < template < typename > class ALLOC = std::allocator >
117  public:
118  /// type for the allocators passed in arguments of methods
120 
121  // ##########################################################################
122  /// @name Constructors / Destructors
123  // ##########################################################################
124 
125  /// @{
126 
127  /// default constructor
128  /** @param val_type indicates whether the DBTranslator deals with discrete
129  * or continuous variables
130  * @param editable_dictionary indicates whether the dictionary used for
131  * translations can be updated dynamically when observing new strings or
132  * whether it should remain constant. To see how this parameter is handled,
133  * see the child classes inheriting from DBTranslator
134  * @param missing_symbols the set of symbols in the database
135  * representing missing values
136  * @param max_dico_entries the max number of entries that the dictionary
137  * can contain. If we try to add a new entry once this maximum has been
138  * reached, this is considered an error and a SizeError exception is raised
139  * @param alloc The allocator used to allocate memory for all the
140  * fields of the DBTranslator
141  */
142  template < template < typename > class XALLOC >
143  DBTranslator(DBTranslatedValueType val_type,
144  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
145  const bool editable_dictionary = true,
146  std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(),
147  const allocator_type& alloc = allocator_type());
148 
149  /// default constructor without missing symbols
150  /** @param val_type indicates whether the DBTranslator deals with discrete
151  * or continuous variables
152  * @param editable_dictionary indicates whether the dictionary used for
153  * translations can be updated dynamically when observing new strings or
154  * whether it should remain constant. To see how this parameter is handled,
155  * see the child classes inheriting from DBTranslator
156  * @param max_dico_entries the max number of entries that the dictionary
157  * can contain. If we try to add a new entry once this maximum has been
158  * reached, this is considered an error and a SizeError exception is raised
159  * @param alloc The allocator used to allocate memory for all the
160  * fields of the DBTranslator
161  */
162  DBTranslator(DBTranslatedValueType val_type,
163  const bool editable_dictionary = true,
164  std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(),
165  const allocator_type& alloc = allocator_type());
166 
167  /// copy constructor
168  DBTranslator(const DBTranslator< ALLOC >& from);
169 
170  /// copy constructor with a given allocator
171  DBTranslator(const DBTranslator< ALLOC >& from, const allocator_type& alloc);
172 
173  /// move constructor
174  DBTranslator(DBTranslator< ALLOC >&& from);
175 
176  /// move constructor with a given allocator
177  DBTranslator(DBTranslator< ALLOC >&& from, const allocator_type& alloc);
178 
179  /// virtual copy constructor
180  virtual DBTranslator< ALLOC >* clone() const = 0;
181 
182  /// virtual copy constructor with a given allocator
183  virtual DBTranslator< ALLOC >* clone(const allocator_type& alloc) const = 0;
184 
185  /// destructor
186  virtual ~DBTranslator();
187 
188  /// @}
189 
190 
191  // ##########################################################################
192  /// @name Operators
193  // ##########################################################################
194 
195  /// @{
196 
197  /// alias for method translate
199 
200  /// alias for method translateBack
201  std::string operator>>(const DBTranslatedValue translated_val);
202 
203  /// @}
204 
205 
206  // ##########################################################################
207  /// @name Accessors / Modifiers
208  // ##########################################################################
209 
210  /// @{
211 
212  /// returns the translation of a string
213  /** This method tries to translate a given string into the
214  * DBTranslatedValue that should be stored into a DatabaseTable. If the
215  * translator cannot find the translation in its current dictionary, then
216  * two situations can
217  * occur:
218  * -# if the translator is not in an editable dictionary mode, then the
219  * translator raises an UnknownLabelInDatabase exception.
220  * -# if the translator is in an editable dictionary mode, i.e., it is
221  * allowed to update its dictionary, then it tries to add the string
222  * as a new value in the dictionary. Upon success, it returns the
223  * translated value, otherwise, it raises either:
224  * - a SizeError exception if the number of entries in the dictionary
225  * has already reached its maximum,
226  * - a TypeError exception if the string cannot be converted into a
227  * value that can be inserted into the dictionary
228  * - an OperationNotAllowed exception if the translation would induce
229  * incoherent behavior (e.g., a DBTranslator4ContinuousVariable that
230  * contains a variable whose domain is [x,y] as well as a missing
231  * value symbol z \f$\in\f$ [x,y]).
232  *
233  * @warning Note that missing values (i.e., strings encoded as missing
234  * symbols) are translated as std::numeric_limits<>::max ().
235  * @param str the string that the DBTranslator will try to translate
236  * @return the translated value of the string to be stored into a
237  * DatabaseTable
238  * @throws UnknownLabelInDatabase is raised if the translation cannot
239  * be found and the translator is not in an editable dictionary mode.
240  * @throws SizeError is raised if the number of entries in the dictionary
241  * has already reached its maximum.
242  * @throws OperationNotAllowed exception is raised if the translation
243  * cannot be found and the insertion of the string into the translator's
244  * dictionary fails because it would induce incoherent behavior (e.g.,
245  * a DBTranslator4ContinuousVariable that contains a variable whose domain
246  * is [x,y] as well as a missing value symbol z \f$\in\f$ [x,y]).
247  * @throws TypeError is raised if the translation cannot be found and
248  * the insertion of the string into the translator's dictionary fails
249  * due to str being impossible to be converted into an appropriate type. */
250  virtual DBTranslatedValue translate(const std::string& str) = 0;
251 
252  /// returns the original value for a given translation
253  /** @param translated_val a value that should result from a translation
254  * and for which we are looking for the corresponding DBTranslator's
255  * variable's label (a string)
256  * @return the string that was translated into a given DBTranslatedValue.
257  * @warning when the translator is not a proper bijection, like, e.g.,
258  * DBTranslator4DiscretizedVariable, the method returns the value of
259  * the random variable corresponding to translated_val (i.e., for a
260  * discretized variable, it would return the interval corresponding to
261  * translated_val).
262  * @throws UnknownLabelInDatabase is raised if this original value
263  * cannot be found */
264  virtual std::string translateBack(const DBTranslatedValue translated_val) const = 0;
265 
266  /// returns the domain size of a variable corresponding to the translations
267  /** Assume that the translator has been fed with the observed values of
268  * a random variable. Then it has produced a set of translated values. The
269  * latter define the domain of the variable. When the variable is discrete,
270  * values are assumed to span from 0 to a number n-1. In this case, the
271  * domain size of the variable is n. When the variable is continuous,
272  * the domain size should be infinite and we return a
273  * std::numeric_limits<std::size_t>::max() to represent it. Note that
274  * missing values are encoded as std::numeric_limits<>::max () and are
275  * not taken into account in the domain sizes. */
276  virtual std::size_t domainSize() const = 0;
277 
278  /// indicates whether the translator has an editable dictionary or not
279  virtual bool hasEditableDictionary() const;
280 
281  /// sets/unsets the editable dictionary mode
282  virtual void setEditableDictionaryMode(bool new_mode);
283 
284  /** @brief indicates whether a reordering is needed to make the
285  * translations sorted
286  *
287  * If the strings represented by the translations are only numbers,
288  * translations are considered to be sorted if and only if they are sorted
289  * by increasing number. If the strings do not only represent numbers, then
290  * translations are considered to be sorted if and only if they are sorted
291  * lexicographically.
292  *
293  * When constructing dynamically its dictionary, the translator may
294  * assign wrong DBTranslatedValue values to strings. For instance, a
295  * translator reading sequentially integer strings 4, 1, 3, may map
296  * 4 into DBTranslatedValue{std::size_t(0)},
297  * 1 into DBTranslatedValue{std::size_t(1)} and
298  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
299  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
300  * a domain specified with increasing values. This requires a
301  * reordering. Method needsReordering() returns a Boolean indicating
302  * whether such a reordering should be performed or whether the current
303  * order is OK. */
304  virtual bool needsReordering() const = 0;
305 
306  /** @brief performs a reordering of the dictionary and returns a mapping
307  * from the old translated values to the new ones.
308  *
309  * When a reordering is needed, i.e., string values must be translated
310  * differently, Method reorder() computes how the translations should be
311  * changed. It updates accordingly the dictionary and returns the mapping
312  * that enables changing the old dictionary values into the new ones.
313  * Note that the hash table returned is expressed in terms of std::size_t
314  * because only the translations for discrete random variables need be
315  * reordered, those for continuous random variables are identity mappings.
316  * @warning If there is no reordering to perform, the method returns
317  * an empty hashtable. */
318  virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > >
319  reorder() = 0;
320 
321  /// returns the set of missing symbols taken into account by the translator
322  const Set< std::string, ALLOC< std::string > >& missingSymbols() const;
323 
324  /// indicates whether a string corresponds to a missing symbol
325  bool isMissingSymbol(const std::string& str) const;
326 
327  /// returns the variable stored into the translator
328  virtual const Variable* variable() const = 0;
329 
330  /// sets the name of the variable stored into the translator
331  void setVariableName(const std::string& str) const;
332 
333  /// sets the description of the variable stored into the translator
334  void setVariableDescription(const std::string& str) const;
335 
336  /// returns the type of values handled by the translator
337  /** @returns either DBTranslatedValueType::DISCRETE if the translator
338  * includes a discrete variable or DBTranslatedValueType::CONTINUOUS if
339  * it contains a continuous variable. This is convenient to know how to
340  * interpret the DBTranslatedValue instances produced by the DBTranslator:
341  * either using their discr_val field or their cont_val field. */
343 
344  /// returns the allocator used by the translator
346 
347  /// indicates whether a translated value corresponds to a missing value
348  bool isMissingValue(const DBTranslatedValue& val) const;
349 
350  /// returns the translation of a missing value
351  virtual DBTranslatedValue missingValue() const = 0;
352 
353  /// @}
354 
355 
356  protected:
357  // ##########################################################################
358  /// @name Protected Operators
359  // ##########################################################################
360 
361  /// @{
362 
363  /// copy operator
365 
366  /// move operator
368 
369  /// @}
370 
371 
372  /// indicates whether the dictionary can be updated or not
374 
375  /// the maximum number of entries that the dictionary is allowed to contain
376  std::size_t max_dico_entries_;
377 
378  /// the set of missing symbols
380 
381  /// the bijection relating back translated values and their original strings.
382  /** Note that the translated values considered here are of type std::size_t
383  * because only the values for discrete variables need be stored, those
384  * for continuous variables are actually identity mappings.
385  * @warning only the values of the random variable are stored into this
386  * bijection. Missing values are not considered here. */
387  mutable Bijection< std::size_t, std::string, ALLOC< std::pair< float, std::string > > >
389 
390  /// the type of the values translated by the translator
392  };
393 
394 
395  } /* namespace learning */
396 
397 } /* namespace gum */
398 
399 // always include the template implementation
400 #include <agrum/tools/database/DBTranslator_tpl.h>
401 
402 #endif /* GUM_LEARNING_DB_TRANSLATOR_H */
virtual DBTranslator< ALLOC > * clone(const allocator_type &alloc) const =0
virtual copy constructor with a given allocator
DBTranslatedValue operator<<(const std::string &str)
alias for method translate
virtual DBTranslatedValue missingValue() const =0
returns the translation of a missing value
virtual DBTranslatedValue translate(const std::string &str)=0
returns the translation of a string
virtual std::string translateBack(const DBTranslatedValue translated_val) const =0
returns the original value for a given translation
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
std::size_t max_dico_entries_
the maximum number of entries that the dictionary is allowed to contain
Definition: DBTranslator.h:376
void setVariableName(const std::string &str) const
sets the name of the variable stored into the translator
virtual const Variable * variable() const =0
returns the variable stored into the translator
DBTranslator(DBTranslatedValueType val_type, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, const bool editable_dictionary=true, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor
virtual DBTranslator< ALLOC > * clone() const =0
virtual copy constructor
virtual ~DBTranslator()
destructor
virtual bool needsReordering() const =0
indicates whether a reordering is needed to make the translations sorted
const Set< std::string, ALLOC< std::string > > & missingSymbols() const
returns the set of missing symbols taken into account by the translator
DBTranslator(const DBTranslator< ALLOC > &from)
copy constructor
allocator_type getAllocator() const
returns the allocator used by the translator
bool isMissingValue(const DBTranslatedValue &val) const
indicates whether a translated value corresponds to a missing value
DBTranslator(DBTranslator< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
The base class for all the tabular database cell translators.
Definition: DBTranslator.h:116
DBTranslator< ALLOC > & operator=(const DBTranslator< ALLOC > &from)
copy operator
virtual void setEditableDictionaryMode(bool new_mode)
sets/unsets the editable dictionary mode
bool is_dictionary_dynamic_
indicates whether the dictionary can be updated or not
Definition: DBTranslator.h:373
DBTranslatedValueType val_type_
the type of the values translated by the translator
Definition: DBTranslator.h:391
DBTranslator(const DBTranslator< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
virtual std::size_t domainSize() const =0
returns the domain size of a variable corresponding to the translations
DBTranslator(DBTranslator< ALLOC > &&from)
move constructor
DBTranslator(DBTranslatedValueType val_type, const bool editable_dictionary=true, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor without missing symbols
Bijection< std::size_t, std::string, ALLOC< std::pair< float, std::string > > > back_dico_
the bijection relating back translated values and their original strings.
Definition: DBTranslator.h:388
std::string operator>>(const DBTranslatedValue translated_val)
alias for method translateBack
virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder()=0
performs a reordering of the dictionary and returns a mapping from the old translated values to the n...
virtual bool hasEditableDictionary() const
indicates whether the translator has an editable dictionary or not
DBTranslator< ALLOC > & operator=(DBTranslator< ALLOC > &&from)
move operator
bool isMissingSymbol(const std::string &str) const
indicates whether a string corresponds to a missing symbol
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
void setVariableDescription(const std::string &str) const
sets the description of the variable stored into the translator
DBTranslatedValueType getValType() const
returns the type of values handled by the translator
Set< std::string, ALLOC< std::string > > missing_symbols_
the set of missing symbols
Definition: DBTranslator.h:379