aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBCell.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class representing the original values of the cells of databases
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DB_CELL_H
28 #define GUM_LEARNING_DB_CELL_H
29 
30 #include <cstring>
31 #include <stdexcept>
32 #include <string>
33 #include <type_traits>
34 #include <utility>
35 
36 #include <agrum/agrum.h>
37 #include <agrum/tools/core/bijection.h>
38 
39 namespace gum {
40 
41  namespace learning {
42 
43  /** @class DBCell
44  * @headerfile DBCell.h <agrum/tools/database/DBCell.h>
45  * @brief The class representing the original values of the cells
46  * of databases
47  *
48  * Learning algorithms read input data, the so-called datasets, and
49  * infer their models. To be fast, they should not parse directly
50  * the original datasets, which are often databases of strings, but rather
51  * they should parse preprocessed data with types more suited for fast
52  * analysis. To do so, in aGrUM, learning algorithms parse datasets of
53  * DBTranslatedValue instances. However, sometimes, these algorithms make
54  * some decisions which, afterwards, they may realize were not so good and
55  * should be modified. For instance, when coping with continuous variables,
56  * they may use a discretization for fast learning the structure of a hybrid
57  * Bayesian network. But, given the structure found, they may try to find
58  * a better discretization. With a new discretization, the original dataset
59  * should be parsed again and mapped into new DBTranslatedValue instances.
60  * Unfortunately, reparsing, say, a CSV file or a SQL database, is time
61  * consuming. Sometimes, it is faster to read such datasets once and store
62  * them in an appropriate form that can be mapped easily into
63  * DBTranslatedValue instances. The RawDatabaseTable class is made precisely
64  * for this purpose and can be thought of as a 2-dimensional table, the
65  * elements of which are DBCell instances. The latter can actually encode
66  * compactly integers, real numbers, strings and even missing values. So they
67  * are very well suited to be the type of the cells of the RawDatabaseTable
68  * class.
69  *
70  * @ingroup learning_database */
71  class DBCell {
72  public:
73  /// the set of types possibly taken by the last element read
74  enum class EltType : unsigned char
75  {
76  REAL,
77  INTEGER,
78  STRING,
79  MISSING
80  };
81 
82  // ##########################################################################
83  /// @name Constructors / Destructors
84  // ##########################################################################
85 
86  /// @{
87 
88  /// default constructor (contains a missing value)
89  DBCell();
90 
91  /// constructor for a real number
92  DBCell(const float nb);
93 
94  /// constructor for an integer number
95  DBCell(const int nb);
96 
97  /// constructor for a string
98  DBCell(const std::string& str);
99 
100  /// copy constructor
101  DBCell(const DBCell& from);
102 
103  /// move constructor
104  DBCell(DBCell&& from);
105 
106  /// destructor
107  ~DBCell();
108 
109  /// @}
110 
111 
112  // ##########################################################################
113  /// @name Operators
114  // ##########################################################################
115 
116  /// @{
117 
118  /// copy operator
119  DBCell& operator=(const DBCell& from);
120 
121  /// move operator
122  DBCell& operator=(DBCell&& from);
123 
124  /// assignment operator from a real number
125  DBCell& operator=(const float x);
126 
127  /// assignment operator from an integer
128  DBCell& operator=(const int x);
129 
130  /// assignment operator from a string
131  DBCell& operator=(const std::string& x);
132 
133  /// test of equality
134  bool operator==(const DBCell& from) const;
135 
136  /// test of inequality
137  bool operator!=(const DBCell& from) const;
138 
139  /// @}
140 
141 
142  // ##########################################################################
143  /// @name Accessors / Modifiers
144  // ##########################################################################
145 
146  /// @{
147 
148  /// returns the current type of the DBCell
149  EltType type() const noexcept;
150 
151  /// try to convert the content of the DBCell into another type
152  /** @return true if the conversion has been successfully performed. */
153  bool convertType(const EltType newtype);
154 
155  /// returns the DBCell as a real number
156  /** @warning if the cell is not of type REAL, the DBCell will not try to
157  * convert its content into a real number, it will raise a TypeError
158  * exception.
159  * @return the content of the DBCell (if this is a real number)
160  * @throw TypeError if the DBCell is not labelled as type REAL */
161  float real() const;
162 
163  /// sets the content of the DBCell from a real number
164  void setReal(const float x);
165 
166  /// sets the content of the DBCell from a string
167  /** @throw TypeError if the string does not correspond to a real number */
168  void setReal(const std::string& elt);
169 
170  /// returns the DBCell as an integer
171  /** @warning if the cell is not of type INTEGER, the DBCell will not try
172  * to convert its content into an integer, it will raise a TypeError
173  * exception.
174  * @throw TypeError if the DBCell is not labelled as type INTEGER */
175  int integer() const;
176 
177  /// sets the content of the DBCell from an integer
178  void setInteger(const int x);
179 
180  /// sets the content of the DBCell from a string
181  /** @throw TypeError if the string does not correspond to an integer */
182  void setInteger(const std::string& elt);
183 
184  /// returns the DBCell as a string
185  /** @warning if the cell is not of type STRING, the DBCell will not try to
186  * convert its content into a string, it will raise a TypeError exception.
187  * @throw TypeError if the DBCell is not labelled as type STRING */
188  const std::string& string() const;
189 
190  /** @brief returns the DBCell as the index of a string in a static
191  * bijection
192  *
193  * All strings referenced by DBCells are stored in a bijection. This method
194  * returns the index of the string referenced by the current DBCell.
195  * @throw TypeError if the DBCell is not labelled as type STRING */
196  int stringIndex() const;
197 
198  /// sets the content of the DBCell from a string
199  void setString(const std::string& elt);
200 
201  /// sets the DBCell as a missing element
202  void setMissingState();
203 
204  /// indicates whether the cell contains a missing value
205  bool isMissing() const;
206 
207  /// @}
208 
209 
210  // ##########################################################################
211  /// @name Public Static Accessors / Modifiers
212  // ##########################################################################
213 
214  /// @{
215 
216  /// strings are stored into a static bijection. Get its ith string
217  /** @throw UndefinedElement if the index does not correspond to
218  * any string */
219  static const std::string& string(const int index);
220 
221  /// returns the best type to store a given element encoded as a string
222  /** @param str the string to convert into a DBCell
223  * @param missingVals a vector containing the set of strings that should
224  * be interpreted as missing values. Whenever str matches one of these strings,
225  * the returned EltType represents a missing value. */
226  template < template < typename > class ALLOC = std::allocator >
227  static EltType bestType(const std::string& str,
228  const std::vector< std::string, ALLOC< std::string > >& missingVals);
229 
230  /// returns the DBCell with the best type for an element encoded as a string
231  /** @param str the string to convert into a DBCell
232  * @param missingVals a vector containing the set of strings that should
233  * be interpreted as missing values. Whenever str matches one of these strings,
234  * the returned DBCell represents a missing value. */
235  template < template < typename > class ALLOC = std::allocator >
236  static DBCell bestDBCell(const std::string& str,
237  const std::vector< std::string, ALLOC< std::string > >& missingVals);
238 
239  /// returns the content of the DBCell as a string, whatever its type
240  /** @throw UndefinedElement if the DBCell corresponds to a
241  * missing value but the set of missing values passed in argument is
242  * empty. */
243  template < template < typename > class ALLOC = std::allocator >
244  std::string
245  toString(const std::vector< std::string, ALLOC< std::string > >& missingVals) const;
246 
247  /// determines whether a string corresponds precisely to an integer
248  static bool isInteger(const std::string& str);
249 
250  /// determines whether a string corresponds precisely to a real number
251  static bool isReal(const std::string& str);
252 
253  /// checks whether a string corresponds to a missing value
254  template < template < typename > class ALLOC = std::allocator >
255  static bool isMissing(const std::string& str,
256  const std::vector< std::string, ALLOC< std::string > >& missingVals);
257 
258  /// @}
259 
260 
261 #ifndef DOXYGEN_SHOULD_SKIP_THIS
262 
263  private:
264  // the real type of the last element read from the database
265  EltType _type_{EltType::MISSING};
266 
267  // the element read from the database
268  union {
269  int _val_index_; // stores string indices. Basically, it should have
270  int _val_integer_; // been an Idx, but ints are shorter than Idx.
271  float _val_real_;
272  };
273 
274 
275  // determine the largest (in bytes) type of the union. This is used for fast
276  // copying/moving DBCells
277  using UnionType = typename std::conditional< sizeof(int) < sizeof(float), float, int >::type;
278 
279  // returns the message describing a type error (presumably used when raising TypeError)
280  std::string _typeErrorMsg_(const std::string& real_type) const;
281 
282 
283  // a bijection assigning to each string index its corresponding string
284  static Bijection< std::string, int >& _strings_();
285 
286  // the last index used so far
287  static int _string_max_index_;
288 
289 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
290  };
291 
292  } /* namespace learning */
293 
294 } /* namespace gum */
295 
296 /// include the inlined functions if necessary
297 #ifndef GUM_NO_INLINE
298 # include <agrum/tools/database/DBCell_inl.h>
299 #endif /* GUM_NO_INLINE */
300 
301 #include <agrum/tools/database/DBCell_tpl.h>
302 
303 #endif /* GUM_LEARNING_DB_CELL_H */
~DBCell()
destructor
bool convertType(const EltType newtype)
try to convert the content of the DBCell into another type
void setString(const std::string &elt)
sets the content of the DBCell
DBCell(const int nb)
constructor for an integer number
DBCell(const std::string &str)
constructor for a string
bool isMissing() const
indicates whether the cell contains a missing value
static bool isInteger(const std::string &str)
determines whether a string corresponds precisely to an integer
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
The class representing the original values of the cells of databases.
Definition: DBCell.h:71
DBCell & operator=(const float x)
assignment operator
void setReal(const std::string &elt)
sets the content of the DBCell from a string
DBCell & operator=(DBCell &&from)
move operator
DBCell(const float nb)
constructor for a real number
static bool isMissing(const std::string &str, const std::vector< std::string, ALLOC< std::string > > &missingVals)
checks whether a string corresponds to a missing value
static DBCell bestDBCell(const std::string &str, const std::vector< std::string, ALLOC< std::string > > &missingVals)
returns the DBCell with the best type for an element encoded as a string
int integer() const
returns the DBcell as an integer
float real() const
returns the DBcell as a real number
static bool isReal(const std::string &str)
determine whether a string corresponds precisely to a real number
bool operator!=(const DBCell &from) const
test of inequality
DBCell & operator=(const DBCell &from)
copy operator
DBCell(const DBCell &from)
copy constructor
void setReal(const float x)
sets the content of the DBCell
DBCell(DBCell &&from)
move constructor
static EltType bestType(const std::string &str, const std::vector< std::string, ALLOC< std::string > > &missingVals)
returns the best type to store a given element encoded as a string
int stringIndex() const
returns the DBcell as the index of a string in a static bijection
void setInteger(const std::string &elt)
sets the content of the DBCell from a string
EltType type() const noexcept
returns the current type of the DBCell
bool operator==(const DBCell &from) const
test of equality
std::string toString(const std::vector< std::string, ALLOC< std::string > > &missingVals) const
returns the content of the DBCell as a string, whatever its type
void setInteger(const int x)
sets the content of the DBCell
DBCell & operator=(const std::string &x)
assignment operator
EltType
the set of types possibly taken by the last element read
Definition: DBCell.h:74
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
void setMissingState()
sets the DBCell as a missing element
DBCell()
default constructor (contains a missing value)
DBCell & operator=(const int x)
assignment operator