aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
DBRowGeneratorSet.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief class for packing sets of generators
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DBROW_GENERATOR_SET_H
28 #define GUM_LEARNING_DBROW_GENERATOR_SET_H
29 
30 #include <vector>
31 
32 #include <agrum/agrum.h>
33 #include <agrum/tools/database/DBRow.h>
34 #include <agrum/tools/database/DBTranslatedValue.h>
35 #include <agrum/tools/database/DBRowGenerator.h>
36 #include <agrum/tools/database/DBRowGeneratorWithBN.h>
37 
38 namespace gum {
39 
40  namespace learning {
41 
42 
43  /** @class DBRowGeneratorSet
44  * @headerfile DBRowGeneratorSet.h <agrum/tools/database/DBRowGeneratorSet.h>
45  * @ingroup learning_database
46  * @brief The class used to pack sets of generators
47  *
48  * When learning Bayesian networks, the records of the train dataset are
49  * used to construct contingency tables that are either exploited in
50  * statistical conditional independence tests or in scores. To achieve this,
51  * the values of the DatabaseTable's records need all be observed, i.e.,
52  * there should be no missing value. When this is not the case, we need to
53  * decide what to do with the records (actually the DBRows) that contain
54  * missing values. Should we discard them? Should we use an EM algorithm to
55  * substitute them by several fully-observed DBRows weighted by their
56  * probability of occurrence? Should we use a K-means algorithm to substitute
57  * them by only one DBRow of highest probability of occurrence? DBRowGenerator
58  * classes are used to perform these substitutions. From one input DBRow,
59  * they can produce from 0 to several output DBRows. DBRowGenerator instances
60  * can be used in sequences, i.e., a first DBRowGenerator can, e.g., apply
61  * an EM algorithm to produce many output DBRows, then these DBRows can
62  * feed another DBRowGenerator that only keeps those whose weight is higher
63  * than a given threshold. The purpose of Class DBRowGeneratorSet is to
64  * contain this sequence of DBRowGenerator instances. The key idea is that it
65  * makes the parsing of the output DBRow generated easier. For instance, if
66  * we want to use a sequence of 2 generators, outputting 3 times and 4 times
67  * the DBRows they get in input respectively, we could use the following
68  * code:
69  * @code
70  * gum::learning::DatabaseTable<> database ( ... );
71  * gum::learning::DBRowGeneratorDuplicate<> generator3 ( col_types, 3 );
72  * gum::learning::DBRowGeneratorDuplicate<> generator4 ( col_types, 4 );
73  *
74  * for ( auto dbrow : database ) {
75  * generator3.setInputRow ( dbrow );
76  * while ( generator3.hasRows () ) {
77  * const auto& output3_dbrow = generator3.generate ();
78  * generator4.setInputRow ( output3_dbrow );
79  * while ( generator4.hasRows () ) {
80  * const auto& output4_dbrow = generator4.generate ();
81  * // do something with output4_dbrow
82  * }
83  * }
84  * }
85  * @endcode
86  * For each input DBRow of the DatabaseTable, these while loops output
87  * 3 x 4 = 12 identical DBRows.
88  * As can be seen, when several DBRowGenerator instances are to be used
89  * in sequence, the code is not very easy to write. The DBRowGeneratorSet
90  * simplifies the coding as follows:
91  * @code
92  * gum::learning::DatabaseTable<> database ( ... );
93  * gum::learning::DBRowGeneratorDuplicate<> generator3 ( col_types, 3 );
94  * gum::learning::DBRowGeneratorDuplicate<> generator4 ( col_types, 4 );
95  *
96  * DBRowGeneratorSet<> genset;
97  * genset.insertGenerator ( generator3 );
98  * genset.insertGenerator ( generator4 );
99  * for ( auto dbrow : database ) {
100  * genset.setInputRow ( dbrow );
101  * while ( genset.hasRows () ) {
102  * const auto& output_dbrow = genset.generate ();
103  * // do something with output_dbrow
104  * }
105  * }
106  * @endcode
107  * As can be seen, whatever the number of DBRowGenerator instances packed
108  * into the DBRowGeneratorSet, only one while loop is needed to
109  * parse all the generated output DBRow instances.
110  */
111  template < template < typename > class ALLOC = std::allocator >
113  public:
114  /// type for the allocators passed in arguments of methods
116 
117  // ##########################################################################
118  /// @name Constructors / Destructors
119  // ##########################################################################
120 
121  /// @{
122 
123  /// default constructor
124  DBRowGeneratorSet(const allocator_type& alloc = allocator_type());
125 
126  /// copy constructor
127  DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC >& from);
128 
129  /// copy constructor with a given allocator
130  DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC >& from, const allocator_type& alloc);
131 
132  /// move constructor
133  DBRowGeneratorSet(DBRowGeneratorSet< ALLOC >&& from);
134 
135  /// move constructor with a given allocator
136  DBRowGeneratorSet(DBRowGeneratorSet< ALLOC >&& from, const allocator_type& alloc);
137 
138  /// virtual copy constructor
139  virtual DBRowGeneratorSet< ALLOC >* clone() const;
140 
141  /// virtual copy constructor with a given allocator
142  virtual DBRowGeneratorSet< ALLOC >* clone(const allocator_type& alloc) const;
143 
144  /// destructor
145  virtual ~DBRowGeneratorSet();
146 
147  /// @}
148 
149 
150  // ##########################################################################
151  /// @name Operators
152  // ##########################################################################
153 
154  /// @{
155 
156  /// copy operator
158 
159  /// move operator
161 
162  /// returns the ith generator
163  /** @warning this operator assumes that there are at least i+1 generators.
164  * So, it won't check that the ith generator actually exists. If unsure,
165  * use method generatorSafe that performs this check. */
166  DBRowGenerator< ALLOC >& operator[](const std::size_t i);
167 
168  /// returns the ith generator
169  /** @warning this operator assumes that there are at least i+1 generators.
170  * So, it won't check that the ith generator actually exists. If unsure,
171  * use method generatorSafe that performs this check. */
172  const DBRowGenerator< ALLOC >& operator[](const std::size_t i) const;
173 
174  /// @}
175 
176 
177  // ##########################################################################
178  /// @name Accessors / Modifiers
179  // ##########################################################################
180 
181  /// @{
182 
183  /// inserts a new generator at the end of the set
184  /** @throw OperationNotAllowed is raised if the generator set has already
185  * started generating output rows and is currently in a state where the
186  * generation is not completed yet (i.e., we still need to call the
187  * generate() method to complete it). */
188  template < template < template < typename > class > class Generator >
189  void insertGenerator(const Generator< ALLOC >& generator);
190 
191  /// inserts a new generator at the ith position of the set
192  /** @throw OperationNotAllowed is raised if the generator set has already
193  * started generating output rows and is currently in a state where the
194  * generation is not completed yet (i.e., we still need to call the
195  * generate() method to complete it). */
196  template < template < template < typename > class > class Generator >
197  void insertGenerator(const Generator< ALLOC >& generator, const std::size_t i);
198 
199  /// returns the number of generators
200  std::size_t nbGenerators() const noexcept;
201 
202  /// returns the number of generators (alias for nbGenerators)
203  std::size_t size() const noexcept;
204 
205  /** @brief returns true if there are still rows that can be output
206  * by the set of generators */
207  bool hasRows();
208 
209  /// sets the input row from which the generators will create new rows
210  /** @return true if the set of generators is able to generate output
211  * rows from the input row passed in argument */
212  bool setInputRow(const DBRow< DBTranslatedValue, ALLOC >& input_row);
213 
214  /// generates a new output row from the input row
215  const DBRow< DBTranslatedValue, ALLOC >& generate();
216 
217  /// assign a new Bayes net to all the generators that depend on a BN
218  /** Typically, generators based on EM or K-means depend on a model to
219  * compute their outputs correctly. Method setBayesNet makes it possible to
220  * update their BN model. */
221  template < typename GUM_SCALAR >
222  void setBayesNet(const BayesNet< GUM_SCALAR >& new_bn);
223 
224  /// resets all the generators
225  void reset();
226 
227  /// removes all the generators
228  void clear();
229 
230  /** @brief sets the columns of interest: the output DBRow needs only
231  * contain correct values for these columns
232  *
233  * This method is useful, e.g., for EM-like algorithms that need to know
234  * which unobserved variables/values need be filled. In this case, the
235  * DBRowGenerator instances contained in the DBRowGeneratorSet still
236  * output DBRows with the same columns as the DatabaseTable, but only the
237  * columns of these DBRows corresponding to those passed in argument to
238  * Method setColumnsOfInterest are meaningful. For instance, if a
239  * DatabaseTable contains 10 columns and Method setColumnsOfInterest() is
240  * applied with vector<> { 0, 3, 4 }, then the DBRowGenerator instances
241  * contained in the DBRowGeneratorSet will output DBRows with 10 columns,
242  * in which only columns 0, 3 and 4 are guaranteed to have correct values
243  * (columns are always indexed, starting from 0).
244  *
245  * @throw OperationNotAllowed is raised if the generator set has already
246  * started generating output rows and is currently in a state where the
247  * generation is not completed yet (i.e., we still need to call the
248  * generate() method to complete it). */
250  const std::vector< std::size_t, ALLOC< std::size_t > >& cols_of_interest);
251 
252  /** @brief sets the columns of interest: the output DBRow needs only
253  * contain correct values for these columns
254  *
255  * This method is useful, e.g., for EM-like algorithms that need to know
256  * which unobserved variables/values need be filled. In this case, the
257  * DBRowGenerator instances contained in the DBRowGeneratorSet still
258  * output DBRows with the same columns as the DatabaseTable, but only the
259  * columns of these DBRows corresponding to those passed in argument to
260  * Method setColumnsOfInterest are meaningful. For instance, if a
261  * DatabaseTable contains 10 columns and Method setColumnsOfInterest() is
262  * applied with vector<> { 0, 3, 4 }, then the DBRowGenerator instances
263  * contained in the DBRowGeneratorSet will output DBRows with 10 columns,
264  * in which only columns 0, 3 and 4 are guaranteed to have correct values
265  * (columns are always indexed, starting from 0).
266  *
267  * @throw OperationNotAllowed is raised if the generator set has already
268  * started generating output rows and is currently in a state where the
269  * generation is not completed yet (i.e., we still need to call the
270  * generate() method to complete it). */
271  void
272  setColumnsOfInterest(std::vector< std::size_t, ALLOC< std::size_t > >&& cols_of_interest);
273 
274  /// returns the current set of columns of interest
275  const std::vector< std::size_t, ALLOC< std::size_t > >& columnsOfInterest() const;
276 
277  /// returns the allocator used
279 
280  /// @}
281 
282 
283 #ifndef DOXYGEN_SHOULD_SKIP_THIS
284 
285  private:
286  // the vector of all the generators
288 
289  // the number of generators
290  std::size_t _nb_generators_{std::size_t(0)};
291 
292  // the next output row to return when method generate is called
293  const DBRow< DBTranslatedValue, ALLOC >* _output_row_{nullptr};
294 
295  // the generation of output rows can be viewed as the traversal of a
296  // tree: each node of the tree corresponds to the input row received by
297  // a generator. So the root node is the row passed in argument to
298  // the setInputRow() Method. From these input rows, generators produce
299  // through their generate() method new output rows, which correspond to
300  // the input rows of the next level of the tree. If we traverse this tree
301  // in terms of generators rather than in terms of input rows, which makes
302  // sense knowing our data structures, we need to know whether we should
303  // call only Method generate() to move down the tree or whether we should
304  // call first Method setInputRow() and then generate(). Actually, when
305  // a generator receives a new input row, it should call Method
306  // setInputRow(), which updates its data structure in order to be able
307  // to subsequently produce new output rows using Method generate().
308  // So, we need to discriminate between the situation in which Method
309  // setInputRow() has already been called from the situation in which
310  // we first need to call setInputRow(). The following vector allows this
311  // discrimination: when its cells equal 0, we need to call setInputRow()
312  // first, else when they equal 1, we just need to call the generate()
313  // method.
314  std::vector< int, ALLOC< int > > _setInputRow_performed_;
315 
316 
317  /// parse the row generation tree to produce a new row
318  /** @param input_row this is used only when i = 0. In this case, if the
319  * first generator has not yet performed its call to setInputRow, then
320  * we call this method, passing input_row as its argument
321  * @param i indicate the generator from which we start the traversal
322  */
323  bool _produceNextRow_(const DBRow< DBTranslatedValue, ALLOC >* input_row, std::size_t i);
324 
325 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
326  };
327 
328  } /* namespace learning */
329 
330 } /* namespace gum */
331 
332 // always include the template implementation
333 #include <agrum/tools/database/DBRowGeneratorSet_tpl.h>
334 
335 
336 #endif /* GUM_LEARNING_DBROW_GENERATOR_SET_H */
allocator_type getAllocator() const
returns the allocator used
std::size_t size() const noexcept
returns the number of generators (alias for nbGenerators)
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
The class used to pack sets of generators.
void insertGenerator(const Generator< ALLOC > &generator, const std::size_t i)
inserts a new generator at the ith position of the set
void clear()
removes all the generators
DBRowGeneratorSet(DBRowGeneratorSet< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
DBRowGeneratorSet< ALLOC > & operator=(DBRowGeneratorSet< ALLOC > &&from)
move operator
virtual DBRowGeneratorSet< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
DBRowGeneratorSet(DBRowGeneratorSet< ALLOC > &&from)
move constructor
void setBayesNet(const BayesNet< GUM_SCALAR > &new_bn)
assign a new Bayes net to all the generators that depend on a BN
void setColumnsOfInterest(const std::vector< std::size_t, ALLOC< std::size_t > > &cols_of_interest)
sets the columns of interest: the output DBRow needs only contain correct values for these columns ...
void setColumnsOfInterest(std::vector< std::size_t, ALLOC< std::size_t > > &&cols_of_interest)
sets the columns of interest: the output DBRow needs only contain correct values for these columns ...
DBRowGeneratorSet< ALLOC > & operator=(const DBRowGeneratorSet< ALLOC > &from)
copy operator
DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
const std::vector< std::size_t, ALLOC< std::size_t > > & columnsOfInterest() const
returns the current set of columns of interest
virtual DBRowGeneratorSet< ALLOC > * clone() const
virtual copy constructor
void reset()
resets all the generators
void insertGenerator(const Generator< ALLOC > &generator)
inserts a new generator at the end of the set
virtual ~DBRowGeneratorSet()
destructor
DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC > &from)
copy constructor
std::size_t nbGenerators() const noexcept
returns the number of generators
DBRowGenerator< ALLOC > & operator[](const std::size_t i)
returns the ith generator
const DBRow< DBTranslatedValue, ALLOC > & generate()
generates a new output row from the input row
const DBRowGenerator< ALLOC > & operator[](const std::size_t i) const
returns the ith generator
bool setInputRow(const DBRow< DBTranslatedValue, ALLOC > &input_row)
sets the input row from which the generators will create new rows
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
bool hasRows()
returns true if there are still rows that can be output by the set of generators