aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBTranslator4DiscretizedVariable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The databases' cell translators for discretized variables
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_DISCRETIZED_VARIABLE_H
28 #define GUM_LEARNING_DB_TRANSLATOR_4_DISCRETIZED_VARIABLE_H
29 
30 #include <agrum/agrum.h>
31 #include <agrum/tools/database/DBTranslator.h>
32 #include <agrum/tools/variables/discretizedVariable.h>
33 
34 
35 namespace gum {
36 
37  namespace learning {
38 
39 
40  /** @class DBTranslator4DiscretizedVariable
41  * @headerfile DBTranslator4DiscretizedVariable.h <agrum/tools/database/DBTranslator4DiscretizedVariable.h>
42  * @brief The databases' cell translators for discretized variables
43  *
44  * Translators are used by DatabaseTable instances to transform datasets'
45  * strings into DBTranslatedValue instances. The point is that strings are
46  * not adequate for fast learning, they need to be preprocessed into a type
47  * that can be analyzed quickly (the so-called DBTranslatedValue type).
48  *
49  * A DBTranslator4DiscretizedVariable is a translator that contains and
50  * exploits a DiscretizedVariable for translations. Each time a string needs
51  * to be translated, we ask the DiscretizedVariable which discretization
52  * interval contains the number represented by the string. The
53  * DBTranslatedValue corresponding to the translation of the string
54  * contains in its discr_val field the index of this discretization interval.
55  *
56  * @warning Translators for discretized variables are not editable, that is,
57  * you must provide the const variable that will be used for translations.
58  * Enabling the editable mode would not make much sense.
59  *
60  * @par Here is an example of how to use this class:
61  * @code
62  * // create the translator, with possible missing symbols: "N/A" and "???"
63  * // i.e., each time the translator reads a "N/A" or a "???" string, it
64  * // won't translate it into a number but into a missing value.
65  * std::vector<std::string> missing { "N/A", "???" };
66  * gum::DiscretizedVariable<int> var ( "X1", "" );
67  * var.addTick ( 1 );
68  * var.addTick ( 3 );
69  * var.addTick ( 10 );
70  * gum::learning::DBTranslator4DiscretizedVariable<> translator( var,missing );
71  *
72  * // gets the DBTranslatedValue corresponding to some strings
73  * auto val1 = translator.translate("5.2");
74  * auto val2 = translator << "2";
75  * // at this point, val1 and val2 are equal to
76  * // gum::learning::DBTranslatedValue { std::size_t(1) } and
77  * // gum::learning::DBTranslatedValue { std::size_t(0) } respectively
78  * // because the first discretization interval corresponds to [1;3[ and
79  * // the second one to [3;10[.
80  *
81  * // if the string contains a number outside the domain of the
82  * // DiscretizedVariable, then a gum::UnknownLabelInDatabase is raised:
83  * auto val3 = translator << "17"; // UnknownLabelInDatabase raised
84  *
85  * // add the numbers assigned to val1, val2
86  * std::size_t sum = val1.discr_val + val2.discr_val;
87  *
88  * // translate missing values: val4 and val5 will be equal to:
89  * // DBTranslatedValue { std::numeric_limits<std::size_t>::max () }
90  * auto val4 = translator << "N/A";
91  * auto val5 = translator.translate ( "???" );
92  *
93  * // the following instructions raise TypeError exceptions because the
94  * // strings are not numbers
95  * auto val6 = translator << "422x";
96  * auto val7 = translator.translate ( "xxx" );
97  *
98  * // given a DBTranslatedValue that is supposed to contain the index of
99  * // a discretization interval, get the string representing the interval.
100  * std::string str;
101  * str = translator.translateBack ( val1 ); // str = "[3;10["
102  * str = translator >> val2; // str = "[1;3["
103  * str = translator >> gum::learning::DBTranslatedValue {std::size_t(1)};
104  * // str = "[3;10["
105  *
106  * // translate back missing values: the string will correspond to one of
107  * // the missing symbols known to the translator
108  * str = translator >> val4; // str = "N/A" or "???"
109  * str = translator >> val5; // str = "N/A" or "???"
110  *
111  * // get the variable stored within the translator
112  * const gum::DiscretizedVariable<int>* stored_var =
113  * dynamic_cast<const gum::DiscretizedVariable<int>*>
114  * ( translator.variable () );
115  * @endcode
116  *
117  * @ingroup learning_database
118  */
119  template < template < typename > class ALLOC = std::allocator >
121  public:
122  /// type for the allocators passed in arguments of methods
124 
125 
126  // ##########################################################################
127  /// @name Constructors / Destructors
128  // ##########################################################################
129 
130  /// @{
131 
132  /// default constructor with a discretized variable as translator
133  /** @param var a discretized variable which will be used for translations.
134  * The translator keeps a copy of this variable
135  * @param missing_symbols the set of symbols in the dataset
136  * representing missing values
137  * @param max_dico_entries the max number of entries that the dictionary
138  * can contain. During the construction, we check that the discretized
139  * variable passed in argument has fewer discretization intervals than
140  * the admissible dictionary size
141  * @param alloc The allocator used to allocate memory for all the
142  * fields of the DBTranslator4DiscretizedVariable
143  * @warning If the variable contained into the translator has a label
144  * equal to a missing value symbol, the label will be taken into
145  * account in the translation, not the missing value. */
146  template < typename GUM_SCALAR, template < typename > class XALLOC >
148  const DiscretizedVariable< GUM_SCALAR >& var,
149  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
151  const allocator_type& alloc = allocator_type());
152 
153  /** @brief default constructor with a discretized variable as translator
154  * but without missing symbols
155  *
156  * @param var a discretized variable which will be used for translations.
157  * The translator keeps a copy of this variable
158  * @param max_dico_entries the max number of entries that the dictionary
159  * can contain. During the construction, we check that the discretized
160  * variable passed in argument has fewer discretization intervals than
161  * the admissible dictionary size
162  * @param alloc The allocator used to allocate memory for all the
163  * fields of the DBTranslator4DiscretizedVariable
164  * @warning If the variable contained into the translator has a label
165  * equal to a missing value symbol, the label will be taken into
166  * account in the translation, not the missing value. */
167  template < typename GUM_SCALAR >
168  DBTranslator4DiscretizedVariable(const DiscretizedVariable< GUM_SCALAR >& var,
169  std::size_t max_dico_entries
170  = std::numeric_limits< std::size_t >::max(),
171  const allocator_type& alloc = allocator_type());
172 
173  /// default constructor with an IDiscretizedVariable as translator
174  /** @param var an IDiscretizedVariable which will be used for translations.
175  * The translator keeps a copy of this variable
176  * @param missing_symbols the set of symbols in the dataset
177  * representing missing values
178  * @param max_dico_entries the max number of entries that the dictionary
179  * can contain. During the construction, we check that the discretized
180  * variable passed in argument has fewer discretization intervals than
181  * the admissible dictionary size
182  * @param alloc The allocator used to allocate memory for all the
183  * fields of the DBTranslator4DiscretizedVariable
184  * @warning If the variable contained into the translator has a label
185  * equal to a missing value symbol, the label will be taken into
186  * account in the translation, not the missing value. */
187  template < template < typename > class XALLOC >
189  const IDiscretizedVariable& var,
190  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
192  const allocator_type& alloc = allocator_type());
193 
194  /** @brief default constructor with an IDiscretizedVariable as translator
195  * but without missing symbols
196  *
197  * @param var a discretized variable which will be used for translations.
198  * The translator keeps a copy of this variable
199  * @param max_dico_entries the max number of entries that the dictionary
200  * can contain. During the construction, we check that the discretized
201  * variable passed in argument has fewer discretization intervals than
202  * the admissible dictionary size
203  * @param alloc The allocator used to allocate memory for all the
204  * fields of the DBTranslator4DiscretizedVariable
205  * @warning If the variable contained into the translator has a label
206  * equal to a missing value symbol, the label will be taken into
207  * account in the translation, not the missing value. */
208  DBTranslator4DiscretizedVariable(const IDiscretizedVariable& var,
209  std::size_t max_dico_entries
210  = std::numeric_limits< std::size_t >::max(),
211  const allocator_type& alloc = allocator_type());
212 
213  /// copy constructor
214  DBTranslator4DiscretizedVariable(const DBTranslator4DiscretizedVariable< ALLOC >& from);
215 
216  /// copy constructor with a given allocator
217  DBTranslator4DiscretizedVariable(const DBTranslator4DiscretizedVariable< ALLOC >& from,
218  const allocator_type& alloc);
219 
220  /// move constructor
221  DBTranslator4DiscretizedVariable(DBTranslator4DiscretizedVariable< ALLOC >&& from);
222 
223  /// move constructor with a given allocator
224  DBTranslator4DiscretizedVariable(DBTranslator4DiscretizedVariable< ALLOC >&& from,
225  const allocator_type& alloc);
226 
227  /// virtual copy constructor
228  virtual DBTranslator4DiscretizedVariable< ALLOC >* clone() const;
229 
230  /// virtual copy constructor with a given allocator
232 
233  /// destructor
235 
236  /// @}
237 
238 
239  // ##########################################################################
240  /// @name Operators
241  // ##########################################################################
242 
243  /// @{
244 
245  /// copy operator
248 
249  /// move operator
252 
253  /// @}
254 
255 
256  // ##########################################################################
257  /// @name Accessors / Modifiers
258  // ##########################################################################
259 
260  /// @{
261 
262  /// returns the translation of a string
263  /** This method tries to translate a given string into the
264  * DBTranslatedValue that should be stored into a databaseTable. If the
265  * translator cannot find the translation in its current dictionary, then
266  * the translator raises either a TypeError if the string is not a number
267  * or an UnknownLabelInDatabase exception otherwise.
268  *
269  * @warning Note that missing values (i.e., string encoded as missing
270  * symbols) are translated as std::numeric_limits<std::size_t>::max ().
271  * @warning If the variable contained into the translator has a
272  * discretization interval that contains a missing value symbol, the
273  * interval will be taken into account in the translation, not the missing
274  * value.
275  * @return the translated value of the string to be stored into a
276  * DatabaseTable
277  * @throws UnknownLabelInDatabase is raised if the translation cannot be
278  * found.
279  * @throws TypeError is raised if the translation cannot be found and
280  * the string does not correspond to a number. */
281  virtual DBTranslatedValue translate(const std::string& str) final;
282 
283  /// returns the original value for a given translation
284  /** @return the string that was translated into a given DBTranslatedValue.
285  * @throws UnknownLabelInDatabase is raised if this original value
286  * cannot be found */
287  virtual std::string translateBack(const DBTranslatedValue translated_val) const final;
288 
289  /// returns the number of discretization intervals used for translations
290  /** @warning Note that missing values are encoded as
291  * std::numeric_limits<>::max () and are not taken into account in the
292  * domain sizes. */
293  virtual std::size_t domainSize() const final;
294 
295  /// indicates that the translator is never in editable dictionary mode
296  virtual bool hasEditableDictionary() const final;
297 
298  /// sets/unsets the editable dictionary mode
299  virtual void setEditableDictionaryMode(bool new_mode) final;
300 
301  /// indicates that the translations should never be reordered
302  virtual bool needsReordering() const final;
303 
304  /** @brief returns an empty HashTable to indicate that no reordering
305  * is needed. */
306  virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > >
307  reorder() final;
308 
309  /// returns the variable stored into the translator
310  virtual const IDiscretizedVariable* variable() const final;
311 
312  /// returns the translation of a missing value
313  virtual DBTranslatedValue missingValue() const final;
314 
315  /// @}
316 
317 
318 #ifndef DOXYGEN_SHOULD_SKIP_THIS
319 
320  private:
321  // the DiscretizedVariable used for translations
322  DiscretizedVariable< float > _variable_;
323 
324  // the DiscretizedVariable returned by method variable ()
325  // We must return a IDiscretizedVariable because the user may have
326  // saved into the translator a DiscretizedVariable<X>, with X != float
327  IDiscretizedVariable* _real_variable_;
328 
329 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
330  };
331 
332 
333  } /* namespace learning */
334 
335 } /* namespace gum */
336 
337 
338 // always include the template implementation
339 #include <agrum/tools/database/DBTranslator4DiscretizedVariable_tpl.h>
340 
341 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_DISCRETIZED_VARIABLE_H */
DBTranslator4DiscretizedVariable< ALLOC > & operator=(const DBTranslator4DiscretizedVariable< ALLOC > &from)
copy operator
virtual DBTranslator4DiscretizedVariable< ALLOC > * clone() const
virtual copy constructor
DBTranslator4DiscretizedVariable(DBTranslator4DiscretizedVariable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual DBTranslatedValue translate(const std::string &str) final
returns the translation of a string
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
DBTranslator4DiscretizedVariable(const DiscretizedVariable< GUM_SCALAR > &var, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a discretized variable as translator but without missing symbols ...
DBTranslator4DiscretizedVariable(const IDiscretizedVariable &var, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a IDiscretized variable as translator
virtual std::string translateBack(const DBTranslatedValue translated_val) const final
returns the original value for a given translation
DBTranslator4DiscretizedVariable(const IDiscretizedVariable &var, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a IDiscretized variable as translator but without missing symbols ...
virtual const IDiscretizedVariable * variable() const final
returns the variable stored into the translator
virtual DBTranslator4DiscretizedVariable< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
virtual DBTranslatedValue missingValue() const final
returns the translation of a missing value
DBTranslator4DiscretizedVariable(const DBTranslator4DiscretizedVariable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
virtual bool hasEditableDictionary() const final
indicates that the translator is never in editable dictionary mode
virtual bool needsReordering() const final
indicates that the translations should never be reordered
DBTranslator4DiscretizedVariable(const DiscretizedVariable< GUM_SCALAR > &var, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a discretized variable as translator
DBTranslator4DiscretizedVariable(const DBTranslator4DiscretizedVariable< ALLOC > &from)
copy constructor
virtual std::size_t domainSize() const final
returns the number of discretization intervals used for translations
virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder() final
returns an empty HashTable to indicate that no reordering is needed.
DBTranslator4DiscretizedVariable< ALLOC > & operator=(DBTranslator4DiscretizedVariable< ALLOC > &&from)
move operator
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
DBTranslator4DiscretizedVariable(DBTranslator4DiscretizedVariable< ALLOC > &&from)
move constructor
virtual void setEditableDictionaryMode(bool new_mode) final
sets/unset the editable dictionary mode