aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
DBTranslator4DiscretizedVariable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The databases' cell translators for discretized variables
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_DISCRETIZED_VARIABLE_H
28 #define GUM_LEARNING_DB_TRANSLATOR_4_DISCRETIZED_VARIABLE_H
29 
30 #include <agrum/agrum.h>
31 #include <agrum/tools/database/DBTranslator.h>
32 #include <agrum/tools/variables/discretizedVariable.h>
33 
34 
35 namespace gum {
36 
37  namespace learning {
38 
39 
40  /** @class DBTranslator4DiscretizedVariable
41  * @headerfile DBTranslator4DiscretizedVariable.h <agrum/tools/database/DBTranslator4DiscretizedVariable.h>
42  * @brief The databases' cell translators for discretized variables
43  *
44  * Translators are used by DatabaseTable instances to transform datasets'
45  * strings into DBTranslatedValue instances. The point is that strings are
46  * not adequate for fast learning, they need to be preprocessed into a type
47  * that can be analyzed quickly (the so-called DBTranslatedValue type).
48  *
49  * A DBTranslator4DiscretizedVariable is a translator that contains and
50  * exploits a DiscretizedVariable for translations. Each time a string needs
51  * to be translated, we ask the DiscretizedVariable which discretization
52  * interval contains the number represented by the string. The
53  * DBTranslatedValue corresponding to the translation of the string
54  * contains in its discr_val field the index of this discretization interval.
55  *
56  * @warning Translators for discretized variables are not editable, that is,
57  * you must provide the const variable that will be used for translations.
58  * Enabling the editable mode would not make much sense.
59  *
60  * @par Here is an example of how to use this class:
61  * @code
62  * // create the translator, with possible missing symbols: "N/A" and "???"
63  * // i.e., each time the translator reads a "N/A" or a "???" string, it
64  * // won't translate it into a number but into a missing value.
65  * std::vector<std::string> missing { "N/A", "???" };
66  * gum::DiscretizedVariable<int> var ( "X1", "" );
67  * var.addTick ( 1 );
68  * var.addTick ( 3 );
69  * var.addTick ( 10 );
70  * gum::learning::DBTranslator4DiscretizedVariable<> translator( var,missing );
71  *
72  * // gets the DBTranslatedValue corresponding to some strings
73  * auto val1 = translator.translate("5.2");
74  * auto val2 = translator << "2";
75  * // at this point, val1 and val2 are equal to
76  * // gum::learning::DBTranslatedValue { std::size_t(1) } and
77  * // gum::learning::DBTranslatedValue { std::size_t(0) } respectively
78  * // because the first discretization interval corresponds to [1;3[ and
79  * // the second one to [3;10[.
80  *
81  * // if the string contains a number outside the domain of the
82  * // DiscretizedVariable, then a gum::NotFound exception is raised:
83  * auto val3 = translator << "17"; // NotFound raised
84  *
85  * // add the numbers assigned to val1, val2
86  * std::size_t sum = val1.discr_val + val2.discr_val;
87  *
88  * // translate missing values: val4 and val5 will be equal to:
89  * // DBTranslatedValue { std::numeric_limits<std::size_t>::max () }
90  * auto val4 = translator << "N/A";
91  * auto val5 = translator.translate ( "???" );
92  *
93  * // the following instructions raise TypeError exceptions because the
94  * // strings are not numbers
95  * auto val6 = translator << "422x";
96  * auto val7 = translator.translate ( "xxx" );
97  *
98  * // given a DBTranslatedValue that is supposed to contain the index of
99  * // a discretization interval, get the string representing the interval.
100  * std::string str;
101  * str = translator.translateBack ( val1 ); // str = "[3;10["
102  * str = translator >> val2; // str = "[1;3["
103  * str = translator >> gum::learning::DBTranslatedValue {std::size_t(1)};
104  * // str = "[3;10["
105  *
106  * // translate back missing values: the string will correspond to one of
107  * // the missing symbols known to the translator
108  * str = translator >> val4; // str = "N/A" or "???"
109  * str = translator >> val5; // str = "N/A" or "???"
110  *
111  * // get the variable stored within the translator
112  * const gum::DiscretizedVariable<float>* var =
113  * dynamic_cast<const gum::DiscretizedVariable<float>*>
114  * ( translator.variable () );
115  * @endcode
116  *
117  * @ingroup learning_database
118  */
119  template < template < typename > class ALLOC = std::allocator >
121  public:
122  /// type for the allocators passed in arguments of methods
124 
125 
126  // ##########################################################################
127  /// @name Constructors / Destructors
128  // ##########################################################################
129 
130  /// @{
131 
132  /// default constructor with a discretized variable as translator
133  /** @param var a discretized variable which will be used for translations.
134  * The translator keeps a copy of this variable
135  * @param missing_symbols the set of symbols in the dataset
136  * representing missing values
137  * @param max_dico_entries the max number of entries that the dictionary
138  * can contain. During the construction, we check that the discretized
139  * variable passed in argument has fewer discretization intervals than
140  * the admissible dictionary size
141  * @param alloc The allocator used to allocate memory for all the
142  * fields of the DBTranslator4DiscretizedVariable
143  * @warning If the variable contained into the translator has a label
144  * equal to a missing value symbol, the label will be taken into
145  * account in the translation, not the missing value. */
146  template < typename GUM_SCALAR, template < typename > class XALLOC >
148  const DiscretizedVariable< GUM_SCALAR >& var,
149  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
151  const allocator_type& alloc = allocator_type());
152 
153  /** @brief default constructor with a discretized variable as translator
154  * but without missing symbols
155  *
156  * @param var a discretized variable which will be used for translations.
157  * The translator keeps a copy of this variable
158  * @param max_dico_entries the max number of entries that the dictionary
159  * can contain. During the construction, we check that the discretized
160  * variable passed in argument has fewer discretization intervals than
161  * the admissible dictionary size
162  * @param alloc The allocator used to allocate memory for all the
163  * fields of the DBTranslator4DiscretizedVariable
164  * @warning If the variable contained into the translator has a label
165  * equal to a missing value symbol, the label will be taken into
166  * account in the translation, not the missing value. */
167  template < typename GUM_SCALAR >
169  const DiscretizedVariable< GUM_SCALAR >& var,
170  std::size_t max_dico_entries = std::numeric_limits< std::size_t >::max(),
171  const allocator_type& alloc = allocator_type());
172 
173  /// default constructor with an IDiscretizedVariable as translator
174  /** @param var an IDiscretizedVariable which will be used for translations.
175  * The translator keeps a copy of this variable
176  * @param missing_symbols the set of symbols in the dataset
177  * representing missing values
178  * @param max_dico_entries the max number of entries that the dictionary
179  * can contain. During the construction, we check that the discretized
180  * variable passed in argument has fewer discretization intervals than
181  * the admissible dictionary size
182  * @param alloc The allocator used to allocate memory for all the
183  * fields of the DBTranslator4DiscretizedVariable
184  * @warning If the variable contained into the translator has a label
185  * equal to a missing value symbol, the label will be taken into
186  * account in the translation, not the missing value. */
187  template < template < typename > class XALLOC >
189  const IDiscretizedVariable& var,
190  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
192  const allocator_type& alloc = allocator_type());
193 
194  /** @brief default constructor with an IDiscretizedVariable as translator
195  * but without missing symbols
196  *
197  * @param var a discretized variable which will be used for translations.
198  * The translator keeps a copy of this variable
199  * @param max_dico_entries the max number of entries that the dictionary
200  * can contain. During the construction, we check that the discretized
201  * variable passed in argument has fewer discretization intervals than
202  * the admissible dictionary size
203  * @param alloc The allocator used to allocate memory for all the
204  * fields of the DBTranslator4DiscretizedVariable
205  * @warning If the variable contained into the translator has a label
206  * equal to a missing value symbol, the label will be taken into
207  * account in the translation, not the missing value. */
208  DBTranslator4DiscretizedVariable(const IDiscretizedVariable& var,
209  std::size_t max_dico_entries
210  = std::numeric_limits< std::size_t >::max(),
211  const allocator_type& alloc
212  = allocator_type());
213 
214  /// copy constructor
216  const DBTranslator4DiscretizedVariable< ALLOC >& from);
217 
218  /// copy constructor with a given allocator
220  const DBTranslator4DiscretizedVariable< ALLOC >& from,
221  const allocator_type& alloc);
222 
223  /// move constructor
225  DBTranslator4DiscretizedVariable< ALLOC >&& from);
226 
227  /// move constructor with a given allocator
229  DBTranslator4DiscretizedVariable< ALLOC >&& from,
230  const allocator_type& alloc);
231 
232  /// virtual copy constructor
233  virtual DBTranslator4DiscretizedVariable< ALLOC >* clone() const;
234 
235  /// virtual copy constructor with a given allocator
237  clone(const allocator_type& alloc) const;
238 
239  /// destructor
241 
242  /// @}
243 
244 
245  // ##########################################################################
246  /// @name Operators
247  // ##########################################################################
248 
249  /// @{
250 
251  /// copy operator
254 
255  /// move operator
258 
259  /// @}
260 
261 
262  // ##########################################################################
263  /// @name Accessors / Modifiers
264  // ##########################################################################
265 
266  /// @{
267 
268  /// returns the translation of a string
269  /** This method tries to translate a given string into the
270  * DBTranslatedValue that should be stored into a databaseTable. If the
271  * translator cannot find the translation in its current dictionary, then
272  * the translator raises either a TypeError if the string is not a number
273  * or a NotFound exception.
274  *
275  * @warning Note that missing values (i.e., strings encoded as missing
276  * symbols) are translated as std::numeric_limits<std::size_t>::max ().
277  * @warning If the variable contained into the translator has a
278  * discretization interval that contains a missing value symbol, the
279  * interval will be taken into account in the translation, not the missing
280  * value.
281  * @return the translated value of the string to be stored into a
282  * DatabaseTable
283  * @throws UnknownLabelInDatabase is raised if the translation cannot be
284  * found.
285  * @throws TypeError is raised if the translation cannot be found and
286  * the string does not correspond to a number. */
287  virtual DBTranslatedValue translate(const std::string& str) final;
288 
289  /// returns the original value for a given translation
290  /** @return the string that was translated into a given DBTranslatedValue.
291  * @throws UnknownLabelInDatabase is raised if this original value
292  * cannot be found */
293  virtual std::string
295 
296  /// returns the number of discretization intervals used for translations
297  /** @warning Note that missing values are encoded as
298  * std::numeric_limits<>::max () and are not taken into account in the
299  * domain sizes. */
300  virtual std::size_t domainSize() const final;
301 
302  /// indicates that the translator is never in editable dictionary mode
303  virtual bool hasEditableDictionary() const final;
304 
305  /// sets/unsets the editable dictionary mode
306  virtual void setEditableDictionaryMode(bool new_mode) final;
307 
308  /// indicates that the translations should never be reordered
309  virtual bool needsReordering() const final;
310 
311  /** @brief returns an empty HashTable to indicate that no reordering
312  * is needed. */
313  virtual HashTable< std::size_t,
314  std::size_t,
315  ALLOC< std::pair< std::size_t, std::size_t > > >
316  reorder() final;
317 
318  /// returns the variable stored into the translator
319  virtual const IDiscretizedVariable* variable() const final;
320 
321  /// returns the translation of a missing value
322  virtual DBTranslatedValue missingValue() const final;
323 
324  /// @}
325 
326 
327 #ifndef DOXYGEN_SHOULD_SKIP_THIS
328 
329  private:
330  // the DiscretizedVariable used for translations
331  DiscretizedVariable< float > variable__;
332 
333  // the DiscretizedVariable returned by method variable ()
334  // We must return a IDiscretizedVariable because the user may have
335  // saved into the translator a DiscretizedVariable<X>, with X != float
336  IDiscretizedVariable* real_variable__;
337 
338 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
339  };
340 
341 
342  } /* namespace learning */
343 
344 } /* namespace gum */
345 
346 
347 // always include the template implementation
348 #include <agrum/tools/database/DBTranslator4DiscretizedVariable_tpl.h>
349 
350 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_DISCRETIZED_VARIABLE_H */
DBTranslator4DiscretizedVariable< ALLOC > & operator=(const DBTranslator4DiscretizedVariable< ALLOC > &from)
copy operator
virtual DBTranslator4DiscretizedVariable< ALLOC > * clone() const
virtual copy constructor
DBTranslator4DiscretizedVariable(DBTranslator4DiscretizedVariable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual DBTranslatedValue translate(const std::string &str) final
returns the translation of a string
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
DBTranslator4DiscretizedVariable(const DiscretizedVariable< GUM_SCALAR > &var, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a discretized variable as translator but without missing symbols ...
DBTranslator4DiscretizedVariable(const IDiscretizedVariable &var, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a IDiscretized variable as translator
virtual std::string translateBack(const DBTranslatedValue translated_val) const final
returns the original value for a given translation
DBTranslator4DiscretizedVariable(const IDiscretizedVariable &var, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a IDiscretized variable as translator but without missing symbols ...
virtual const IDiscretizedVariable * variable() const final
returns the variable stored into the translator
virtual DBTranslator4DiscretizedVariable< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
virtual DBTranslatedValue missingValue() const final
returns the translation of a missing value
DBTranslator4DiscretizedVariable(const DBTranslator4DiscretizedVariable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
virtual bool hasEditableDictionary() const final
indicates that the translator is never in editable dictionary mode
virtual bool needsReordering() const final
indicates that the translations should never be reordered
DBTranslator4DiscretizedVariable(const DiscretizedVariable< GUM_SCALAR > &var, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a discretized variable as translator
DBTranslator4DiscretizedVariable(const DBTranslator4DiscretizedVariable< ALLOC > &from)
copy constructor
virtual std::size_t domainSize() const final
returns the number of discretization intervals used for translations
virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder() final
returns an empty HashTable to indicate that no reordering is needed.
DBTranslator4DiscretizedVariable< ALLOC > & operator=(DBTranslator4DiscretizedVariable< ALLOC > &&from)
move operator
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
DBTranslator4DiscretizedVariable(DBTranslator4DiscretizedVariable< ALLOC > &&from)
move constructor
virtual void setEditableDictionaryMode(bool new_mode) final
sets/unset the editable dictionary mode