aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBTranslator4LabelizedVariable.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The databases' cell translators for labelized variables
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DB_TRANSLATOR_4_LABELIZED_VARIABLE_H
28 #define GUM_LEARNING_DB_TRANSLATOR_4_LABELIZED_VARIABLE_H
29 
30 #include <agrum/agrum.h>
31 #include <agrum/tools/database/DBTranslator.h>
32 #include <agrum/tools/variables/labelizedVariable.h>
33 
34 
35 namespace gum {
36 
37  namespace learning {
38 
39 
40  /** @class DBTranslator4LabelizedVariable
41  * @headerfile DBTranslator4LabelizedVariable.h <agrum/tools/database/DBTranslator4LabelizedVariable.h>
42  * @brief The databases' cell translators for labelized variables
43  *
44  * Translators are used by DatabaseTable instances to transform datasets'
45  * strings into DBTranslatedValue instances. The point is that strings are
46  * not adequate for fast learning, they need to be preprocessed into a type
47  * that can be analyzed quickly (the so-called DBTranslatedValue type).
48  *
49  * A DBTranslator4LabelizedVariable is a translator that contains and
50  * exploits a LabelizedVariable for translations. Each time a string needs
51  * to be translated, we ask the LabelizedVariable to provide the index of
52  * the label corresponding to the string. This index, when encoded into a
53  * DBTranslatedValue, is precisely the translation of the string.
54  *
55  * @par Here is an example of how to use this class:
56  * @code
57  * // create the translator, with possible missing symbols: "N/A" and "???"
58  * // i.e., each time the translator reads a "N/A" or a "???" string, it
59  * // won't translate it into a number but into a missing value.
60  * std::vector<std::string> missing { "N/A", "???" };
61  * gum::learning::DBTranslator4LabelizedVariable<> translator ( missing );
62  *
63  * // gets the DBTranslatedValue corresponding to some strings:
64  * auto val1 = translator.translate("xxx");
65  * auto val2 = translator.translate("zzz");
66  * auto val3 = translator << "yyy";
67  * auto val2bis = translator.translate( "zzz" );
68  * // In the first assignment, the translator initially contains an empty
69  * // domain LabelizedVariable and it is by default in editable mode. So
70  * // we add a new label "xxx" to the LabelizedVariable contained in the
71  * // translator, and the index of this label is 0. Therefore, we have that
72  * // val1 = DBTranslatedValue {std::size_t(0)}. Similarly, the assignments of
73  * // val2 and val3 induce the additions of labels "zzz" and "yyy" into the
74  * // LabelizedVariable. As a result, val2 = DBTranslatedValue {std::size_t(1)}
75  * // and val3 = DBTranslatedValue {std::size_t(2)}. In the assignment of
76  * // val2bis, label "zzz" already exists and its index is equal to 1. So
77  * // val2bis = DBTranslatedValue {std::size_t(1)}.
78  *
79  * // add the numbers assigned to val1, val2, val3
80  * std::size_t sum = val1.discr_val + val2.discr_val + val3.discr_val;
81  *
82  * // translate missing values: val4 and val5 will be equal to:
83  * // DBTranslatedValue { std::numeric_limits<std::size_t>::max () }
84  * auto val4 = translator << "N/A";
85  * auto val5 = translator.translate ( "???" );
86  *
87  * // given a DBTranslatedValue that is supposed to contain a label's index,
88  * // get the corresponding label.
89  * std::string str;
90  * str = translator.translateBack ( val1 ); // str = "xxx"
91  * str = translator >> val2; // str = "zzz"
92  * str = translator >> gum::learning::DBTranslatedValue {std::size_t(1)};
93  * // str = "zzz"
94  * // if there is no such label, Exception UnknownLabelInDatabase is raised:
95  * str = translator >> gum::learning::DBTranslatedValue {std::size_t(4)};
96  *
97  * // translate back missing values: the string will correspond to one of
98  * // the missing symbols known to the translator
99  * str = translator >> val4; // str = "N/A" or "???"
100  * str = translator >> val5; // str = "N/A" or "???"
101  *
102  * // get the domain size of the variable stored into the translator
103  * std::size_t size = translator.domainSize (); // size = 3
104  *
105  * // get the variable stored within the translator
106  * const gum::LabelizedVariable* var =
107  * dynamic_cast<const gum::LabelizedVariable*> ( translator.variable () );
108  *
109  * // it is possible to create a translator for an already known variable.
110  * // In this case, by default, the translator is not in editable mode, but
111  * // this behavior can be changed passing the right arguments to the
112  * // constructor of the translator, or using the setEditableDictionaryMode
113  * // method.
114  * gum::LabelizedVariable var ( "X1", "", 0 );
115  * var.addLabel ( "toto" );
116  * var.addLabel ( "titi" );
117  * var.addLabel ( "tutu" );
118  * gum::learning::DBTranslator4LabelizedVariable<> translator2 (var,missing);
119  *
120  * std::size_t index1 = translator2.translate ( "toto" ).discr_val; // = 0
121  * std::size_t index2 = translator2.translate ( "tutu" ).discr_val; // = 2
122  * std::size_t index3 = translator2.translate ( "N/A" ).discr_val;
123  * // here index3 corresponds to the index of a missing value, hence it is
124  * // equal to std::numeric_limits<std::size_t>::max ()
125  *
126  * // trying to translate a string which is not a label of var will raise
127  * // Exception UnknownLabelInDatabase
128  * translator2.translate ( "xxx" ); // UnknownLabelInDatabase
129  * @endcode
130  *
131  * @ingroup learning_database
132  */
133  template < template < typename > class ALLOC = std::allocator >
135  public:
136  /// type for the allocators passed in arguments of methods
138 
139  // ##########################################################################
140  /// @name Constructors / Destructors
141  // ##########################################################################
142 
143  /// @{
144 
145  /// default constructor without any initial variable
146  /** When using this constructor, it is assumed implicitly that the
147  * dictionary contained into the translator is editable. So, when reading
148  * the database, if we observe a label that has not been encountered
149  * before, we add it into the dictionary of the translator (hence into
150  * the variable contained by the translator).
151  * @param missing_symbols the set of symbols in the database
152  * representing missing values
153  * @param max_dico_entries the max number of entries that the dictionary
154  * can contain. If we try to add a new entry once this maximum has been
155  * reached, this is considered an error and a SizeError exception is raised
156  * @param alloc The allocator used to allocate memory for all the
157  * fields of the DBTranslator4LabelizedVariable
158  */
159  template < template < typename > class XALLOC >
161  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
163  const allocator_type& alloc = allocator_type());
164 
165  /// default constructor without any initial variable nor missing symbols
166  /** When using this constructor, it is assumed implicitly that the
167  * dictionary contained into the translator is editable. So, when reading
168  * the database, if we observe a label that has not been encountered
169  * before, we add it into the dictionary of the translator (hence into
170  * the variable contained by the translator).
171  * @param max_dico_entries the max number of entries that the dictionary
172  * can contain. If we try to add a new entry once this maximum has been
173  * reached, this is considered an error and a SizeError exception is raised
174  * @param alloc The allocator used to allocate memory for all the
175  * fields of the DBTranslator4LabelizedVariable
176  */
177  DBTranslator4LabelizedVariable(std::size_t max_dico_entries
178  = std::numeric_limits< std::size_t >::max(),
179  const allocator_type& alloc = allocator_type());
180 
181  /// default constructor with a labelized variable as translator
182  /** @param var a labelized variable whose labels will be used for
183  * translations. The translator keeps a copy of this variable
184  * @param missing_symbols the set of symbols in the database
185  * representing missing values
186  * @param editable_dictionary the mode in which the translator will perform
187  * translations: when false (the default), the translation of a string
188  * that does not correspond to a label of var will raise an
189  * UnknownLabelInDatabase exception; when true, the translator will try to
190  * add the string as a new label into var (and therefore into the dictionary)
191  * @param max_dico_entries the max number of entries that the dictionary
192  * can contain. If we try to add a new entry once this maximum has been
193  * reached, this is considered an error and a SizeError exception is raised
194  * @param alloc The allocator used to allocate memory for all the
195  * fields of the DBTranslator4LabelizedVariable
196  * @warning If the variable contained into the translator has a label
197  * equal to a missing value symbol, the label will be taken into
198  * account in the translations, not the missing value.
199  */
200  template < template < typename > class XALLOC >
202  const LabelizedVariable& var,
203  const std::vector< std::string, XALLOC< std::string > >& missing_symbols,
204  const bool editable_dictionary = false,
206  const allocator_type& alloc = allocator_type());
207 
208  /** @brief default constructor with a labelized variable as translator
209  * but without missing symbols
210  *
211  * @param var a labelized variable whose labels will be used for
212  * translations. The translator keeps a copy of this variable
213  * @param editable_dictionary the mode in which the translator will perform
214  * translations: when false (the default), the translation of a string
215  * that does not correspond to a label of var will raise an
216  * UnknownLabelInDatabase exception; when true, the translator will try to
217  * add the string as a new label into var (and therefore into the dictionary)
218  * @param max_dico_entries the max number of entries that the dictionary
219  * can contain. If we try to add a new entry once this maximum has been
220  * reached, this is considered an error and a SizeError exception is raised
221  * @param alloc The allocator used to allocate memory for all the
222  * fields of the DBTranslator4LabelizedVariable
223  * @warning If the variable contained into the translator has a label
224  * equal to a missing value symbol, the label will be taken into
225  * account in the translations, not the missing value.
226  */
227  DBTranslator4LabelizedVariable(const LabelizedVariable& var,
228  const bool editable_dictionary = false,
229  std::size_t max_dico_entries
230  = std::numeric_limits< std::size_t >::max(),
231  const allocator_type& alloc = allocator_type());
232 
233  /// copy constructor
234  DBTranslator4LabelizedVariable(const DBTranslator4LabelizedVariable< ALLOC >& from);
235 
236  /// copy constructor with a given allocator
237  DBTranslator4LabelizedVariable(const DBTranslator4LabelizedVariable< ALLOC >& from,
238  const allocator_type& alloc);
239 
240  /// move constructor
241  DBTranslator4LabelizedVariable(DBTranslator4LabelizedVariable< ALLOC >&& from);
242 
243  /// move constructor with a given allocator
244  DBTranslator4LabelizedVariable(DBTranslator4LabelizedVariable< ALLOC >&& from,
245  const allocator_type& alloc);
246 
247  /// virtual copy constructor
248  virtual DBTranslator4LabelizedVariable< ALLOC >* clone() const;
249 
250  /// virtual copy constructor with a given allocator
252 
253  /// destructor
255 
256  /// @}
257 
258 
259  // ##########################################################################
260  /// @name Operators
261  // ##########################################################################
262 
263  /// @{
264 
265  /// copy operator
268 
269  /// move operator
272 
273  /// @}
274 
275 
276  // ##########################################################################
277  /// @name Accessors / Modifiers
278  // ##########################################################################
279 
280  /// @{
281 
282  /// returns the translation of a string
283  /** This method tries to translate a given string into the
284  * DBTranslatedValue that should be stored into a DatabaseTable. If the
285  * translator cannot find the translation in its current dictionary, then
286  * two situations can obtain:
287  * -# if the translator is not in an editable dictionary mode, then the
288  * translator raises an UnknownLabelInDatabase exception.
289  * -# if the translator is in an editable dictionary mode, i.e., it is
290  * allowed to update its dictionary, then it tries to add the string
291  * as a new value in the dictionary (or equivalently as a new label
292  * into its labelized variable). Upon success, it returns the
293  * translated value, otherwise, it raises a SizeError exception if the
294  * number of entries in the dictionary has already reached its maximum.
295  *
296  * @warning Note that missing values (i.e., string encoded as missing
297  * symbols) are translated as std::numeric_limits<std::size_t>::max ().
298  * @warning If the variable contained into the translator has a label
299  * equal to a missing value symbol, the label will be taken into
300  * account in the translation, not the missing value.
301  * @param str the string that the translator will try to translate
302  * @return the translated value of the string to be stored into a
303  * DatabaseTable
304  * @throws UnknownLabelInDatabase is raised if the translation cannot be
305  * found and the translator is not in an editable dictionary mode.
306  * @throws SizeError is raised if the number of entries in the dictionary
307  * has already reached its maximum.
308  */
309  virtual DBTranslatedValue translate(const std::string& str) final;
310 
311  /// returns the original value for a given translation
312  /** @return the string that was translated into a given DBTranslatedValue.
313  * @param translated_val a DBTranslatedValue which is supposed to contain
314  * the index of a label of the LabelizedVariable contained in the
315  * translator. The method should return this label
316  * @throws UnknownLabelInDatabase is raised if this original value
317  * cannot be found */
318  virtual std::string translateBack(const DBTranslatedValue translated_val) const final;
319 
320  /// returns the domain size of a variable corresponding to the translations
321  /** Assume that the translator has been fed with the observed values of
322  * a random variable. Then it has produced a set of translated values. The
323  * latter define the domain of the variable. The domainSize is the size
324  * of this domain. In other words, this corresponds to the number of
325  * labels of the LabelizedVariable contained in the translator.
326  * Note that missing values are not taken into account in the domain
327  * sizes. */
328  virtual std::size_t domainSize() const final;
329 
330  /** @brief indicates whether a reordering is needed to make the
331  * translations sorted
332  *
333  * If the strings represented by the translations are only numbers,
334  * translations are considered to be sorted if and only if they are sorted
335  * by increasing number. If the strings do not only represent numbers, then
336  * translations are considered to be sorted if and only if they are sorted
337  * lexicographically.
338  *
339  * When constructing dynamically its dictionary, the translator may
340  * assign wrong DBTranslatedValue values to strings. For instance, a
341  * translator reading sequentially integer strings 4, 1, 3, may map
342  * 4 into DBTranslatedValue{std::size_t(0)},
343  * 1 into DBTranslatedValue{std::size_t(1)} and
344  * 3 into DBTranslatedValue{std::size_t(2)}, resulting in random variables
345  * having domain {4,1,3}. The user may prefer having domain {1,3,4}, i.e.,
346  * a domain specified with increasing values. This requires a
347  * reordering. Method needsReordering() returns a Boolean indicating
348  * whether such a reordering should be performed or whether the current
349  * order is OK. */
350  virtual bool needsReordering() const final;
351 
352  /** @brief performs a reordering of the dictionary and returns a mapping
353  * from the old translated values to the new ones.
354  *
355  * When a reordering is needed, i.e., when string values must be translated
356  * differently, method reorder() computes how the translations should be
357  * changed. It updates the dictionary accordingly and returns the mapping
358  * that enables changing the old dictionary values into the new ones.
359  * @warning If there is no reordering to perform, the method returns
360  * an empty hashtable. */
361  virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > >
362  reorder() final;
363 
364  /// returns the variable stored into the translator
365  virtual const LabelizedVariable* variable() const final;
366 
367  /// returns the translation of a missing value
368  virtual DBTranslatedValue missingValue() const final;
369 
370  /// @}
371 
372 #ifndef DOXYGEN_SHOULD_SKIP_THIS
373 
374  private:
375  // the LabelizedVariable assigned to the translator, if any
376  LabelizedVariable _variable_;
377 
378 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
379  };
380 
381 
382  } /* namespace learning */
383 
384 } /* namespace gum */
385 
386 
387 // always include the template implementation
388 #include <agrum/tools/database/DBTranslator4LabelizedVariable_tpl.h>
389 
390 #endif /* GUM_LEARNING_DB_TRANSLATOR_4_LABELIZED_VARIABLE_H */
virtual const LabelizedVariable * variable() const final
returns the variable stored into the translator
virtual DBTranslatedValue missingValue() const final
returns the translation of a missing value
DBTranslator4LabelizedVariable< ALLOC > & operator=(DBTranslator4LabelizedVariable< ALLOC > &&from)
move operator
virtual ~DBTranslator4LabelizedVariable()
destructor
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
DBTranslator4LabelizedVariable(const LabelizedVariable &var, const std::vector< std::string, XALLOC< std::string > > &missing_symbols, const bool editable_dictionary=false, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a labelized variable as translator
DBTranslator4LabelizedVariable(const std::vector< std::string, XALLOC< std::string > > &missing_symbols, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor without any initial variable
virtual std::size_t domainSize() const final
returns the domain size of a variable corresponding to the translations
DBTranslator4LabelizedVariable(DBTranslator4LabelizedVariable< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual std::string translateBack(const DBTranslatedValue translated_val) const final
returns the original value for a given translation
DBTranslator4LabelizedVariable(std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor without any initial variable nor missing symbols
DBTranslator4LabelizedVariable< ALLOC > & operator=(const DBTranslator4LabelizedVariable< ALLOC > &from)
copy operator
virtual bool needsReordering() const final
indicates whether a reordering is needed to make the translations sorted
DBTranslator4LabelizedVariable(const DBTranslator4LabelizedVariable< ALLOC > &from)
copy constructor
virtual DBTranslatedValue translate(const std::string &str) final
returns the translation of a string
virtual DBTranslator4LabelizedVariable< ALLOC > * clone() const
virtual copy constructor
virtual DBTranslator4LabelizedVariable< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
virtual HashTable< std::size_t, std::size_t, ALLOC< std::pair< std::size_t, std::size_t > > > reorder() final
performs a reordering of the dictionary and returns a mapping from the old translated values to the n...
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
DBTranslator4LabelizedVariable(DBTranslator4LabelizedVariable< ALLOC > &&from)
move constructor
DBTranslator4LabelizedVariable(const LabelizedVariable &var, const bool editable_dictionary=false, std::size_t max_dico_entries=std::numeric_limits< std::size_t >::max(), const allocator_type &alloc=allocator_type())
default constructor with a labelized variable as translator but without missing symbols ...
DBTranslator4LabelizedVariable(const DBTranslator4LabelizedVariable< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator