aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBTranslator4RangeVariable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The databases' cell translators for range variables
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H
28 #define GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H
29 
30 #include <agrum/agrum.h>
31 #include <agrum/tools/core/hashTable.h>
32 #include <agrum/tools/core/set.h>
33 #include <agrum/tools/database/DBTranslator.h>
34 #include <agrum/tools/variables/rangeVariable.h>
35 
36 
37 namespace gum {
38 
39  namespace learning {
40 
41 
42  /** @class DBTranslator4RangeVariable
43  * @headerfile DBTranslator4RangeVariable.h <agrum/tools/database/DBTranslator4RangeVariable.h>
44  * @brief The databases' cell translators for range variables
45  *
46  * Translators are used by DatabaseTable instances to transform datasets'
47  * strings into DBTranslatedValue instances. The point is that strings are
48  * not adequate for fast learning, they need to be preprocessed into a type
49  * that can be analyzed quickly (the so-called DBTranslatedValue type).
50  *
51  * A DBTranslator4RangeVariable is a translator that contains and
52  * exploits a RangeVariable for translations. Each time a string needs
53  * to be translated, we ask the RangeVariable whether its domain contains
54  * the integer value represented in the string. If this is the case, then
55  * the DBTranslatedValue corresponding to the translation of the string
56  * contains in its discr_val field this integer value.
57  *
58  * @par Here is an example of how to use this class:
59  * @code
60  * // create the translator, with possible missing symbols: "N/A" and "???"
61  * // i.e., each time the translator reads a "N/A" or a "???" string, it
62  * // won't translate it into a number but into a missing value.
63  * std::vector<std::string> missing { "N/A", "???" };
64  * gum::learning::DBTranslator4RangeVariable<> translator ( missing );
65  *
66  * // gets the DBTranslatedValue corresponding to some strings
67  * auto val1 = translator.translate("5");
68  * auto val2 = translator.translate("4");
69  * // at this point, val1 and val2 are equal to
70  * // gum::learning::DBTranslatedValue { std::size_t(0) } and
71  * // gum::learning::DBTranslatedValue { std::size_t(1) } respectively.
72  * // In addition, the RangeVariable stored into the translator has
73  * // a domain equal to {4,5}.
74  * auto val3 = translator << "7";
75  * // val3 is encoded as gum::learning::DBTranslatedValue { std::size_t(3) }
76  * // because string "6" is implicitly encoded as
77  * // gum::learning::DBTranslatedValue { std::size_t(2) }.
78  * // In addition, the domain of the range variable is expanded to {4,5,6,7}.
79  *
80  * // add the numbers assigned to val1, val2, val3
81  * std::size_t sum = val1.discr_val + val2.discr_val + val3.discr_val;
82  *
83  * // translate missing values: val4 and val5 will be equal to:
84  * // DBTranslatedValue { std::numeric_limits<std::size_t>::max () }
85  * auto val4 = translator << "N/A";
86  * auto val5 = translator.translate ( "???" );
87  *
88  * // the following instructions raise TypeError exceptions because the
89  * // strings cannot be translated into integers
90  * auto val6 = translator << "422x";
91  * auto val7 = translator.translate ( "xxx" );
92  *
93  * // given a DBTranslatedValue that is supposed to contain an integer in
94  * // the range of the RangeVariable, get the corresponding string.
95  * std::string str;
96  * str = translator.translateBack ( val1 ); // str = "5"
97  * str = translator >> val2; // str = "4"
98  * str = translator >> gum::learning::DBTranslatedValue {std::size_t(2)};
99  * // str = "6"
100  *
101  * // translate back missing values: the string will correspond to one of
102  * // the missing symbols known to the translator
103  * str = translator >> val4; // str = "N/A" or "???"
104  * str = translator >> val5; // str = "N/A" or "???"
105  *
106  * // get the variable stored within the translator
107  * const gum::RangeVariable* var =
108  * dynamic_cast<const gum::RangeVariable*> ( translator.variable () );
109  *
110  * // it is possible to create a translator for an already known variable.
111  * // In this case, by default, the translator is not in editable mode, but
112  * // this behavior can be changed passing the right arguments to the
113  * // constructor of the translator, or using the setEditableDictionaryMode
114  * // method. Here, we create a range variable whose domain is {-2,...,10}
115  * gum::RangeVariable var ( "X", "", -2, 10 );
116  * gum::learning::DBTranslator4RangeVariable<> translator2 ( var, missing );
117  *
118  * auto xval1 = translator2.translate ( "-1" ).discr_val; // xval1 = 1
119  * auto xval2 = translator2.translate ( "7" ).discr_val; // xval2 = 9
120  * auto xval3 = translator2.translate ( "N/A" ).discr_val;
121  * // here xval3 corresponds to a missing value, hence it is equal to
122  * // std::numeric_limits<size_t>::max ()
123  *
124  * // trying to translate a string which is outside the domain of var will
125  * // raise an UnknownLabelInDatabase exception
126  * translator2.translate ( "20" ); // UnknownLabelInDatabase
127  * @endcode
128  *
129  * @ingroup learning_database
130  */
131  template < template < typename > class ALLOC = std::allocator >
133  public:
134  /// type for the allocators passed in arguments of methods
136 
137 
138  // ##########################################################################
139  /// @name Constructors / Destructors
140  // ##########################################################################
141 
142  /// @{
143 
144  /// default constructor without any initial variable
145  /** When using this constructor, it is assumed implicitly that the
146  * dictionary contained into the translator is editable. So, when reading
147  * the database, if we observe a value that has not been encountered
148  * before, we update the range of the dictionary of the translator (hence
149  * that of the variable contained by the translator).
150  * @param missing_symbols the set of symbols in the dataset
151  * representing missing values
152  * @param max_dico_entries the max number of entries that the dictionary
153  * can contain. If we try to add new entries in the dictionary, this will
154  * be considered as an error and a SizeError exception will be raised
155  * @param alloc The allocator used to allocate memory for all the
156  * fields of the DBTranslator4RangeVariable
157  */
158  template < template < typename > class XALLOC >
160  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
162  const allocator_type& alloc = allocator_type());
163 
164  /// default constructor without any initial variable nor missing symbols
165  /** When using this constructor, it is assumed implicitly that the
166  * dictionary contained into the translator is editable. So, when reading
167  * the database, if we observe a value that has not been encountered
168  * before, we update the range of the dictionary of the translator (hence
169  * that of the variable contained by the translator).
170  * @param max_dico_entries the max number of entries that the dictionary
171  * can contain. If we try to add new entries in the dictionary, this will
172  * be considered as an error and a SizeError exception will be raised
173  * @param alloc The allocator used to allocate memory for all the
174  * fields of the DBTranslator4RangeVariable
175  */
176  DBTranslator4RangeVariable(std::size_t max_dico_entries
177  = std::numeric_limits< std::size_t >::max(),
178  const allocator_type& alloc = allocator_type());
179 
180  /// default constructor with a range variable as translator
181  /** @param var a range variable which will be used for translations.
182  * The translator keeps a copy of this variable
183  * @param missing_symbols the set of symbols in the dataset
184  * representing missing values
185  * @param editable_dictionary the mode in which the translator will perform
186  * translations: when false (the default), the translation of a string
187  * that does not correspond to an integer within the range of var will
188  * raise an UnknownLabelInDatabase exception; when true, the translator will try to
189  * expand the domain of the RangeVariable so that the number represented in
190  * the string belongs to this domain (and therefore to the dictionary)
191  * @param max_dico_entries the max number of entries that the dictionary
192  * can contain. If we try to add new entries in the dictionary, this will
193  * be considered as an error and a SizeError exception will be raised
194  * @param alloc The allocator used to allocate memory for all the
195  * fields of the DBTranslator4RangeVariable
196  * @warning If the variable contained into the translator has a value in
197  * the range that is equal to a missing value symbol, the range value will
198  * be taken into account in the translations, not the missing value.
199  */
200  template < template < typename > class XALLOC >
202  const RangeVariable& var,
203  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
204  const bool editable_dictionary = false,
206  const allocator_type& alloc = allocator_type());
207 
208  /** @brief default constructor with a range variable as translator
209  * but without missing symbols
210  *
211  * @param var a range variable which will be used for translations.
212  * The translator keeps a copy of this variable
213  * @param editable_dictionary the mode in which the translator will perform
214  * translations: when false (the default), the translation of a string
215  * that does not correspond to an integer within the range of var will
216  * raise an UnknownLabelInDatabase exception; when true, the translator will try to
217  * expand the domain of the RangeVariable so that the number represented in
218  * the string belongs to this domain (and therefore to the dictionary)
219  * @param max_dico_entries the max number of entries that the dictionary
220  * can contain. If we try to add new entries in the dictionary, this will
221  * be considered as an error and a SizeError exception will be raised
222  * @param alloc The allocator used to allocate memory for all the
223  * fields of the DBTranslator4RangeVariable
224  * @warning If the variable contained into the translator has a value in
225  * the range that is equal to a missing value symbol, the range value will
226  * be taken into account in the translations, not the missing value.
227  */
228  DBTranslator4RangeVariable(const RangeVariable& var,
229  const bool editable_dictionary = false,
230  std::size_t max_dico_entries
231  = std::numeric_limits< std::size_t >::max(),
232  const allocator_type& alloc = allocator_type());
233 
234  /// copy constructor
235  DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC >& from);
236 
237  /// copy constructor with a given allocator
238  DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC >& from,
239  const allocator_type& alloc);
240 
241  /// move constructor
242  DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC >&& from);
243 
244  /// move constructor with a given allocator
245  DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC >&& from,
246  const allocator_type& alloc);
247 
248  /// virtual copy constructor
249  virtual DBTranslator4RangeVariable< ALLOC >* clone() const;
250 
251  /// virtual copy constructor with a given allocator
252  virtual DBTranslator4RangeVariable< ALLOC >* clone(const allocator_type& alloc) const;
253 
254  /// destructor
255  virtual ~DBTranslator4RangeVariable();
256 
257  /// @}
258 
259 
260  // ##########################################################################
261  /// @name Operators
262  // ##########################################################################
263 
264  /// @{
265 
266  /// copy operator
269 
270  /// move operator
272 
273  /// @}
274 
275 
276  // ##########################################################################
277  /// @name Accessors / Modifiers
278  // ##########################################################################
279 
280  /// @{
281 
282  /// returns the translation of a string
283  /** This method tries to translate a given string into the
284  * DBTranslatedValue that should be stored into a databaseTable. If the
285  * translator cannot find the translation in its current dictionary, then
286  * two situations can obtain:
287  * -# if the translator is not in an editable dictionary mode, then the
288  * translator raises an UnknownLabelInDatabase exception.
289  * -# if the translator is in an editable dictionary mode, i.e., it is
290  * allowed to update its dictionary, then it tries to update the range
291  * of its dictionary to include the new value. Upon success, it returns
292  * the translated value, otherwise, it raises either:
293  * - a TypeError exception if the string cannot be converted into a
294  * value that can be inserted into the dictionary
295  * - an OperationNotAllowed exception if the translation would induce
296  * incoherent behavior (e.g., a translator that
297  * contains a variable whose domain is [x,y] as well as a missing
298  * value symbol z \f$\in\f$ [x,y]).
299  * - a SizeError exception if the number of entries in the dictionary,
300  * i.e., the domain size of the RangeVariable, has already reached
301  * its maximum.
302  *
303  * @warning Note that missing values (i.e., strings encoded as missing
304  * symbols) are translated as std::numeric_limits<std::size_t>::max ().
305  * @warning If the variable contained into the translator has a value in
306  * its range equal to a missing value symbol, then this value will be
307  * taken into account in the translation, not the missing value.
308  * @return the translated value of the string to be stored into a
309  * DatabaseTable
310  * @throws UnknownLabelInDatabase is raised if the translation cannot
311  * be found and the translator is not in an editable dictionary mode.
312  * @throws SizeError is raised if the number of entries (the range) in
313  * the dictionary has already reached its maximum.
314  * @throws TypeError is raised if the translation cannot be found and
315  * the translator is in an editable dictionary mode and the string does
316  * not correspond to an integer.
317  * @throws OperationNotAllowed exception is raised if the translation
318  * cannot be found and the insertion of the string into the translator's
319  * dictionary fails because it would induce incoherent behavior (e.g.,
320  * a translator that contains a variable whose domain is {x,y,z,t} as
321  * well as a missing value symbol z).
322  */
323  virtual DBTranslatedValue translate(const std::string& str) final;
324 
325  /// returns the original value for a given translation
326  /** @return the string that was translated into a given DBTranslatedValue.
327  * @throws UnknownLabelInDatabase is raised if this original value cannot
328  * be found */
329  virtual std::string translateBack(const DBTranslatedValue translated_val) const final;
330 
331  /// returns the domain size of a variable corresponding to the translations
332  /** Returns the size of the range of the variable. */
333  virtual std::size_t domainSize() const final;
334 
335  /** @brief indicates whether a reordering is needed to make the
336  * translations sorted by increasing numbers
337  *
338  * When constructing dynamically its dictionary, the translator may
339  * assign wrong DBTranslatedValue values to strings. For instance, a
340  * translator reading sequentially integer strings 2, 1, 3, may map
341  * 2 into DBTranslatedValue{std::size_t(0)},
342  * 1 into DBTranslatedValue{std::size_t(1)} and
343  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
344  * having domain {2,1,3}. The user may prefer having domain {1,2,3}, i.e.,
345  * a domain specified with increasing values. This requires a
346  * reordering. Method needsReordering() returns a Boolean indicating
347  * whether such a reordering should be performed or whether the current
348  * order is OK.
349  */
350  virtual bool needsReordering() const final;
351 
352  /** @brief performs a reordering of the dictionary and returns a mapping
353  * from the old translated values to the new ones.
354  *
355  * When a reordering is needed, i.e., string values must be translated
356  * differently, Method reorder() computes how the translations should be
357  * changed. It updates accordingly the dictionary and returns the mapping
358  * that enables changing the old dictionary values into the new ones.
359  */
360  virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > >
361  reorder() final;
362 
363  /// returns the variable stored into the translator
364  virtual const RangeVariable* variable() const final;
365 
366  /// returns the translation of a missing value
367  virtual DBTranslatedValue missingValue() const final;
368 
369  /// @}
370 
371 
372 #ifndef DOXYGEN_SHOULD_SKIP_THIS
373 
374  private:
375  // the RangeVariable assigned to the translator, if any
376  RangeVariable _variable_;
377 
378  // assign to each integer missing symbol a Boolean indicating whether
379  // we already translated it or not. If we translated it, then we cannot
380  // change the range of the variable so that this range contains the symbol.
381  HashTable< std::string, bool, ALLOC< std::pair< std::string, bool > > >
383 
384  // the set of translations of the integer missing symbols found so far
385  Set< long, ALLOC< long > > _translated_int_missing_symbols_;
386 
387  // a string containing a non int missing symbol
388  // (useful for back translations)
389  std::string _nonint_missing_symbol_;
390 
391 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
392  };
393 
394 
395  } /* namespace learning */
396 
397 } /* namespace gum */
398 
399 
400 // always include the template implementation
401 #include <agrum/tools/database/DBTranslator4RangeVariable_tpl.h>
402 
403 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_RANGE_VARIABLE_H */
virtual bool needsReordering() const final
indicates whether a reordering is needed to make the translations sorted by increasing numbers ...
DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC > &&from)
move constructor
virtual const RangeVariable * variable() const final
returns the variable stored into the translator
virtual std::string translateBack(const DBTranslatedValue translated_val) const final
returns the original value for a given translation
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
DBTranslator4RangeVariable< ALLOC > & operator=(DBTranslator4RangeVariable< ALLOC > &&from)
move operator
DBTranslator4RangeVariable(DBTranslator4RangeVariable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
DBTranslator4RangeVariable(std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor without any initial variable nor missing symbols
DBTranslator4RangeVariable< ALLOC > & operator=(const DBTranslator4RangeVariable< ALLOC > &from)
copy operator
DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
virtual ~DBTranslator4RangeVariable()
destructor
virtual DBTranslatedValue missingValue() const final
returns the translation of a missing value
DBTranslator4RangeVariable(const DBTranslator4RangeVariable< ALLOC > &from)
copy constructor
virtual DBTranslatedValue translate(const std::string &str) final
returns the translation of a string
virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder() final
performs a reordering of the dictionary and returns a mapping from the old translated values to the n...
DBTranslator4RangeVariable(const RangeVariable &var, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, const bool editable_dictionary=false, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a range variable as translator
virtual DBTranslator4RangeVariable< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
DBTranslator4RangeVariable(const RangeVariable &var, const bool editable_dictionary=false, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a range variable as translator but without missing symbols ...
virtual std::size_t domainSize() const final
returns the domain size of a variable corresponding to the translations
virtual DBTranslator4RangeVariable< ALLOC > * clone() const
virtual copy constructor
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
DBTranslator4RangeVariable(const std::vector< std::string, XALLOC< std::string > > &missing_symbols, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor without any initial variable