aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
DBRowGeneratorSet.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief class for packing sets of generators
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DBROW_GENERATOR_SET_H
28 #define GUM_LEARNING_DBROW_GENERATOR_SET_H
29 
30 #include <vector>
31 
32 #include <agrum/agrum.h>
33 #include <agrum/tools/database/DBRow.h>
34 #include <agrum/tools/database/DBTranslatedValue.h>
35 #include <agrum/tools/database/DBRowGenerator.h>
36 #include <agrum/tools/database/DBRowGeneratorWithBN.h>
37 
38 namespace gum {
39 
40  namespace learning {
41 
42 
43  /** @class DBRowGeneratorSet
44  * @headerfile DBRowGeneratorSet.h <agrum/tools/database/DBRowGeneratorSet.h>
45  * @ingroup learning_database
46  * @brief The class used to pack sets of generators
47  *
48  * When learning Bayesian networks, the records of the train dataset are
49  * used to construct contingency tables that are either exploited in
50  * statistical conditional independence tests or in scores. To achieve this,
51  * the values of the DatabaseTable's records need all be observed, i.e.,
52  * there should be no missing value. When this is not the case, we need to
53  * decide what to do with the records (actually the DBRows) that contain
54  * missing values. Should we discard them? Should we use an EM algorithm to
55  * substitute them by several fully-observed DBRows weighted by their
56  * probability of occurrence? Should we use a K-means algorithm to substitute
57  * them by only one DBRow of highest probability of occurrence? DBRowGenerator
58  * classes are used to perform these substitutions. From one input DBRow,
59  * they can produce from 0 to several output DBRows. DBRowGenerator instances
60  * can be used in sequences, i.e., a first DBRowGenerator can, e.g., apply
61  * an EM algorithm to produce many output DBRows, then these DBRows can
62  * feed another DBRowGenerator that only keeps those whose weight is higher
63  * than a given threshold. The purpose of Class DBRowGeneratorSet is to
64  * contain this sequence of DBRowGenerator instances. The key idea is that it
65  * makes the parsing of the output DBRow generated easier. For instance, if
66  * we want to use a sequence of 2 generators, outputting 3 times and 4 times
67  * the DBRows they get in input respectively, we could use the following
68  * code:
69  * @code
70  * gum::learning::DatabaseTable<> database ( ... );
71  * gum::learning::DBRowGeneratorDuplicate<> generator3 ( col_types, 3 );
72  * gum::learning::DBRowGeneratorDuplicate<> generator4 ( col_types, 4 );
73  *
74  * for ( auto dbrow : database ) {
75  * generator3.setInputRow ( dbrow );
76  * while ( generator3.hasRows () ) {
77  * const auto& output3_dbrow = generator3.generate ();
78  * generator4.setInputRow ( output3_dbrow );
79  * while ( generator4.hasRows () ) {
80  * const auto& output4_dbrow = generator4.generate ();
81  * // do something with output4_dbrow
82  * }
83  * }
84  * }
85  * @endcode
86  * For each input DBRow of the DatabaseTable, these while loops output
87  * 3 x 4 = 12 identical DBRows.
88  * As can be seen, when several DBRowGenerator instances are to be used
89  * in sequence, the code is not very easy to write. The DBRowGeneratorSet
90  * simplifies the coding as follows:
91  * @code
92  * gum::learning::DatabaseTable<> database ( ... );
93  * gum::learning::DBRowGeneratorDuplicate<> generator3 ( col_types, 3 );
94  * gum::learning::DBRowGeneratorDuplicate<> generator4 ( col_types, 4 );
95  *
96  * DBRowGeneratorSet<> genset;
97  * genset.insertGenerator ( generator3 );
98  * genset.insertGenerator ( generator4 );
99  * for ( auto dbrow : database ) {
100  * genset.setInputRow ( dbrow );
101  * while ( genset.hasRows () ) {
102  * const auto& output_dbrow = genset.generate ();
103  * // do something with output_dbrow
104  * }
105  * }
106  * @endcode
107  * As can be seen, whatever the number of DBRowGenerator instances packed
108  * into the DBRowGeneratorSet, only one while loop is needed to
109  * parse all the generated output DBRow instances.
110  */
111  template < template < typename > class ALLOC = std::allocator >
113  public:
114  /// type for the allocators passed in arguments of methods
116 
117  // ##########################################################################
118  /// @name Constructors / Destructors
119  // ##########################################################################
120 
121  /// @{
122 
123  /// default constructor
124  DBRowGeneratorSet(const allocator_type& alloc = allocator_type());
125 
126  /// copy constructor
127  DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC >& from);
128 
129  /// copy constructor with a given allocator
130  DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC >& from,
131  const allocator_type& alloc);
132 
133  /// move constructor
134  DBRowGeneratorSet(DBRowGeneratorSet< ALLOC >&& from);
135 
136  /// move constructor with a given allocator
137  DBRowGeneratorSet(DBRowGeneratorSet< ALLOC >&& from,
138  const allocator_type& alloc);
139 
140  /// virtual copy constructor
141  virtual DBRowGeneratorSet< ALLOC >* clone() const;
142 
143  /// virtual copy constructor with a given allocator
144  virtual DBRowGeneratorSet< ALLOC >* clone(const allocator_type& alloc) const;
145 
146  /// destructor
147  virtual ~DBRowGeneratorSet();
148 
149  /// @}
150 
151 
152  // ##########################################################################
153  /// @name Operators
154  // ##########################################################################
155 
156  /// @{
157 
158  /// copy operator
160  operator=(const DBRowGeneratorSet< ALLOC >& from);
161 
162  /// move operator
164 
165  /// returns the ith generator
166  /** @warning this operator assumes that there are at least i+1 generators.
167  * So, it won't check that the ith generator actually exists. If unsure,
168  * use method generatorSafe that performs this check. */
169  DBRowGenerator< ALLOC >& operator[](const std::size_t i);
170 
171  /// returns the ith generator
172  /** @warning this operator assumes that there are at least i+1 generators.
173  * So, it won't check that the ith generator actually exists. If unsure,
174  * use method generatorSafe that performs this check. */
175  const DBRowGenerator< ALLOC >& operator[](const std::size_t i) const;
176 
177  /// @}
178 
179 
180  // ##########################################################################
181  /// @name Accessors / Modifiers
182  // ##########################################################################
183 
184  /// @{
185 
186  /// inserts a new generator at the end of the set
187  /** @throw OperationNotAllowed is raised if the generator set has already
188  * started generating output rows and is currently in a state where the
189  * generation is not completed yet (i.e., we still need to call the
190  * generate() method to complete it). */
191  template < template < template < typename > class > class Generator >
192  void insertGenerator(const Generator< ALLOC >& generator);
193 
194  /// inserts a new generator at the ith position of the set
195  /** @throw OperationNotAllowed is raised if the generator set has already
196  * started generating output rows and is currently in a state where the
197  * generation is not completed yet (i.e., we still need to call the
198  * generate() method to complete it). */
199  template < template < template < typename > class > class Generator >
200  void insertGenerator(const Generator< ALLOC >& generator,
201  const std::size_t i);
202 
203  /// returns the number of generators
204  std::size_t nbGenerators() const noexcept;
205 
206  /// returns the number of generators (alias for nbGenerators)
207  std::size_t size() const noexcept;
208 
209  /** @brief returns true if there are still rows that can be output
210  * by the set of generators */
211  bool hasRows();
212 
213  /// sets the input row from which the generators will create new rows
214  /** @return true if the set of generators is able to generate output
215  * rows from the input row passed in argument */
216  bool setInputRow(const DBRow< DBTranslatedValue, ALLOC >& input_row);
217 
218  /// generates a new output row from the input row
219  const DBRow< DBTranslatedValue, ALLOC >& generate();
220 
221  /// assign a new Bayes net to all the generators that depend on a BN
222  /** Typically, generators based on EM or K-means depend on a model to
223  * correctly compute their outputs. Method setBayesNet makes it possible
224  * to update their BN model. */
225  template < typename GUM_SCALAR >
226  void setBayesNet(const BayesNet< GUM_SCALAR >& new_bn);
227 
228  /// resets all the generators
229  void reset();
230 
231  /// removes all the generators
232  void clear();
233 
234  /** @brief sets the columns of interest: the output DBRow needs only
235  * contain correct values for these columns
236  *
237  * This method is useful, e.g., for EM-like algorithms that need to know
238  * which unobserved variables/values need be filled. In this case, the
239  * DBRowGenerator instances contained in the DBRowGeneratorSet still
240  * output DBRows with the same columns as the DatabaseTable, but only the
241  * columns of these DBRows corresponding to those passed in argument to
242  * Method setColumnsOfInterest are meaningful. For instance, if a
243  * DatabaseTable contains 10 columns and Method setColumnsOfInterest() is
244  * applied with vector<> { 0, 3, 4 }, then the DBRowGenerator instances
245  * contained in the DBRowGeneratorSet will output DBRows with 10 columns,
246  * in which only columns 0, 3 and 4 are guaranteed to have correct values
247  * (columns are always indexed, starting from 0).
248  *
249  * @throw OperationNotAllowed is raised if the generator set has already
250  * started generating output rows and is currently in a state where the
251  * generation is not completed yet (i.e., we still need to call the
252  * generate() method to complete it). */
254  const std::vector< std::size_t, ALLOC< std::size_t > >& cols_of_interest);
255 
256  /** @brief sets the columns of interest: the output DBRow needs only
257  * contain correct values for these columns
258  *
259  * This method is useful, e.g., for EM-like algorithms that need to know
260  * which unobserved variables/values need be filled. In this case, the
261  * DBRowGenerator instances contained in the DBRowGeneratorSet still
262  * output DBRows with the same columns as the DatabaseTable, but only the
263  * columns of these DBRows corresponding to those passed in argument to
264  * Method setColumnsOfInterest are meaningful. For instance, if a
265  * DatabaseTable contains 10 columns and Method setColumnsOfInterest() is
266  * applied with vector<> { 0, 3, 4 }, then the DBRowGenerator instances
267  * contained in the DBRowGeneratorSet will output DBRows with 10 columns,
268  * in which only columns 0, 3 and 4 are guaranteed to have correct values
269  * (columns are always indexed, starting from 0).
270  *
271  * @throw OperationNotAllowed is raised if the generator set has already
272  * started generating output rows and is currently in a state where the
273  * generation is not completed yet (i.e., we still need to call the
274  * generate() method to complete it). */
276  std::vector< std::size_t, ALLOC< std::size_t > >&& cols_of_interest);
277 
278  /// returns the current set of columns of interest
279  const std::vector< std::size_t, ALLOC< std::size_t > >&
280  columnsOfInterest() const;
281 
282  /// returns the allocator used
284 
285  /// @}
286 
287 
288 #ifndef DOXYGEN_SHOULD_SKIP_THIS
289 
290  private:
291  // the vector of all the generators
293  generators__;
294 
295  // the number of generators
296  std::size_t nb_generators__{std::size_t(0)};
297 
298  // the next output row to return when method generate is called
299  const DBRow< DBTranslatedValue, ALLOC >* output_row__{nullptr};
300 
301  // the generation of output rows can be viewed as the traversal of a
302  // tree: each node of the tree corresponds to the input row received by
303  // a generator. So the root node is the row passed in argument to
304  // the setInputRow() Method. From these input rows, generators produce
305  // through their generate() method new output rows, which correspond to
306  // the input rows of the next level of the tree. If we traverse this tree
307  // in terms of generators rather than in terms of input rows, which makes
308  // sense knowing our data structures, we need to know whether we should
309  // call only Method generate() to move down the tree or whether we should
310  // call first Method setInputRow() and then generate(). Actually, when
311  // a generator receives a new input row, it should call Method
312  // setInputRow(), which updates its data structure in order to be able
313  // to subsequently produce new output rows using Method generate().
314  // So, we need to discriminate between the situation in which Method
315  // setInputRow() has already been called from the situation in which
316  // we first need to call setInputRow(). The following vector allows this
317  // discrimination: when its cells equal 0, we need to call setInputRow()
318  // first, else when they equal 1, we just need to call the generate()
319  // method.
320  std::vector< int, ALLOC< int > > setInputRow_performed__;
321 
322 
323  /// parse the row generation tree to produce a new row
324  /** @param input_row this is used only when i = 0. In this case, if the
325  * first generator has not yet performed its call to setInputRow, then
326  * we call this method, passing in argument the input_row
327  * @param i indicates the generator from which we start the traversal
328  */
329  bool produceNextRow__(const DBRow< DBTranslatedValue, ALLOC >* input_row,
330  std::size_t i);
331 
332 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
333  };
334 
335  } /* namespace learning */
336 
337 } /* namespace gum */
338 
339 // always include the template implementation
340 #include <agrum/tools/database/DBRowGeneratorSet_tpl.h>
341 
342 
343 #endif /* GUM_LEARNING_DBROW_GENERATOR_SET_H */
allocator_type getAllocator() const
returns the allocator used
std::size_t size() const noexcept
returns the number of generators (alias for nbGenerators)
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
The class used to pack sets of generators.
void insertGenerator(const Generator< ALLOC > &generator, const std::size_t i)
inserts a new generator at the ith position of the set
void clear()
removes all the generators
DBRowGeneratorSet(DBRowGeneratorSet< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
DBRowGeneratorSet< ALLOC > & operator=(DBRowGeneratorSet< ALLOC > &&from)
move operator
virtual DBRowGeneratorSet< ALLOC > * clone(const allocator_type &alloc) const
virtual copy constructor with a given allocator
DBRowGeneratorSet(DBRowGeneratorSet< ALLOC > &&from)
move constructor
void setBayesNet(const BayesNet< GUM_SCALAR > &new_bn)
assign a new Bayes net to all the generators that depend on a BN
void setColumnsOfInterest(const std::vector< std::size_t, ALLOC< std::size_t > > &cols_of_interest)
sets the columns of interest: the output DBRow needs only contain correct values for these columns ...
void setColumnsOfInterest(std::vector< std::size_t, ALLOC< std::size_t > > &&cols_of_interest)
sets the columns of interest: the output DBRow needs only contain correct values for these columns ...
DBRowGeneratorSet< ALLOC > & operator=(const DBRowGeneratorSet< ALLOC > &from)
copy operator
DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
const std::vector< std::size_t, ALLOC< std::size_t > > & columnsOfInterest() const
returns the current set of columns of interest
virtual DBRowGeneratorSet< ALLOC > * clone() const
virtual copy constructor
void reset()
resets all the generators
void insertGenerator(const Generator< ALLOC > &generator)
inserts a new generator at the end of the set
virtual ~DBRowGeneratorSet()
destructor
DBRowGeneratorSet(const DBRowGeneratorSet< ALLOC > &from)
copy constructor
std::size_t nbGenerators() const noexcept
returns the number of generators
DBRowGenerator< ALLOC > & operator[](const std::size_t i)
returns the ith generator
const DBRow< DBTranslatedValue, ALLOC > & generate()
generates a new output row from the input row
const DBRowGenerator< ALLOC > & operator[](const std::size_t i) const
returns the ith generator
bool setInputRow(const DBRow< DBTranslatedValue, ALLOC > &input_row)
sets the input row from which the generators will create new rows
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
bool hasRows()
returns true if there are still rows that can be output by the set of generators