aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
DBRowGenerator.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The base class for all DBRow generators
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_DBROW_GENERATOR_H
28 #define GUM_LEARNING_DBROW_GENERATOR_H
29 
30 #include <vector>
31 
32 #include <agrum/agrum.h>
33 #include <agrum/tools/database/DBRow.h>
34 #include <agrum/tools/database/DBTranslatedValue.h>
35 
36 namespace gum {
37 
38  namespace learning {
39 
40  /** @enum DBRowGeneratorGoal
41  * @headerfile DBRowGenerator.h <agrum/tools/database/DBRowGenerator.h>
42  * @brief the type of things that a DBRowGenerator is designed for
43  *
44  * @ingroup learning_database
45  */
46  enum class DBRowGeneratorGoal : char
47  {
48  // the generator's goal is only to remove all missing values
50 
51  // the generator does something other than just removing missing values
53  };
54 
55 
56  /** @class DBRowGenerator
57  * @headerfile DBRowGenerator.h <agrum/tools/database/DBRowGenerator.h>
58  * @ingroup learning_database
59  * @brief The base class for all DBRow generators
60  *
61  * A DBRowGenerator instance takes as input a DBRow containing
62  * DBTranslatedValue instances provided directly by a DatabaseTable or
63  * resulting from a DBRow generation by another DBRowGenerator. Then,
64  * it produces from 0 to several instances of DBRow of DBTranslatedValue.
65  * This is essentially useful to deal with missing values: during learning,
66  * when a DBRow contains some missing values, what should we do with it?
67  * Should we discard it? Should we use an EM algorithm to produce several
68  * DBRows weighted by their probability of occurrence? Should we use a
69  * K-means algorithm to produce only one DBRow of highest probability of
70  * occurrence? Using the appropriate DBRowGenerator, you can apply any
71  * of these rules when your learning algorithm parses the DatabaseTable.
72  * You just need to indicate which DBRowGenerator to use; no line of
73  * code needs to be changed in your high-level learning algorithm.
74  *
75  * As an example of how a DBRowGenerator works, an "Identity" DBRowGenerator
76  * takes as input a DBRow and returns it without any further processing, so
77  * it "produces" only one output DBRow. An EM DBRowGenerator takes as input
78  * a DBRow in which some cells may be missing. In this case, it produces all
79  * the possible combinations of values that these missing values may take and
80  * it assigns to these combinations a weight proportional to their probability
81  * of occurrence according to a given model. As such, it may most often produce
82  * several output DBRows.
83  *
84  * The standard usage of a DBRowGenerator is the following:
85  * @code
86  * // create a DatabaseTable and fill it
87  * gum::learning::DBTranslatorSet<> set;
88  * for ( int i = 0; i < 10; ++i )
89  * set.insertTranslator(gum::learning::DBTranslator4LabelizedVariable<>(),i);
90  * gum::learning::DatabaseTable<> database ( set );
91  * // fill the database
92  *
93  * // keep in a vector the types of the columns in the database
94  * const std::vector<gum::learning::DBTranslatedValueType>
95  * column_types ( 10, gum::learning::DBTranslatedValueType::DISCRETE );
96  *
97  * // create the generator
98  * gum::learning::DBRowGeneratorIdentity<> generator ( column_types );
99  *
100  * // parse the database and produce output rows
101  * for ( auto dbrow : database ) {
102  * generator.setInputRow ( dbrow );
103  * while ( generator.hasRows () ) {
104  * const auto& output_dbrow = generator.generate ();
105  * // do something with the output dbrow
106  * }
107  * }
108  * @endcode
109  *
110  * All DBRowGenerator classes should derive from this class. It takes care
111  * of the interaction with the RecordCounter / Score classes. The user
112  * who wishes to create a new DBRowGenerator, say for instance, one that
113  * outputs k times the input row, just has to define the following class
114  * (not all the constructors/destructors are required, but we provide them
115  * for self-consistency), the important part of which is located from the
116  * "Accessors / Modifiers" section on:
117  * @code
118  * template <template<typename> class ALLOC = std::allocator>
119  * class DuplicateGenerator : public DBRowGenerator<ALLOC> {
120  * public:
121  *
122  * /// type for the allocators passed in arguments of methods
123  * using allocator_type = ALLOC<DBTranslatedValue>;
124  *
125  * // ######################################################################
126  * // Constructors / Destructors
127  * // ######################################################################
128  *
129  * /// default constructor
130  * DuplicateGenerator( const std::vector<DBTranslatedValueType,
131  * ALLOC<DBTranslatedValueType>> column_types,
132  * const std::size_t nb_duplicates,
133  * const allocator_type& alloc = allocator_type () )
134  * : DBRowGenerator<ALLOC> ( column_types, alloc )
135  * , nb_duplicates__ ( nb_duplicates ) {}
136  *
137  * /// copy constructor with a given allocator
138  * DuplicateGenerator( const DuplicateGenerator<ALLOC>& from,
139  * const allocator_type& alloc )
140  * : DBRowGenerator<ALLOC>( from, alloc )
141  * , input_row__( from.input_row__ )
142  * , nb_duplicates__ ( from.nb_duplicates__ ) {}
143  *
144  * /// copy constructor
145  * DuplicateGenerator( const DuplicateGenerator<ALLOC>& from )
146  * : DuplicateGenerator<ALLOC> ( from, from.getAllocator () ) {}
147  *
148  * /// move constructor with a given allocator
149  * DuplicateGenerator( DuplicateGenerator<ALLOC>&& from,
150  * const allocator_type& alloc )
151  * : DBRowGenerator<ALLOC> ( std::move( from ), alloc )
152  * , input_row__( from.input_row__ )
153  * , nb_duplicates__ ( from.nb_duplicates__ ) {}
154  *
155  * /// move constructor
156  * DuplicateGenerator( DuplicateGenerator<ALLOC>&& from )
157  * : DuplicateGenerator<ALLOC> ( std::move(from), from.getAllocator() ) {}
158  *
159  * /// virtual copy constructor with a given allocator
160  * virtual DuplicateGenerator<ALLOC>*
161  * clone ( const allocator_type& alloc ) const {
162  * ALLOC<DuplicateGenerator<ALLOC>> allocator ( alloc );
163  * DuplicateGenerator<ALLOC>* generator = allocator.allocate(1);
164  * try { allocator.construct ( generator, *this, alloc ); }
165  * catch ( ... ) {
166  * allocator.deallocate ( generator, 1 );
167  * throw;
168  * }
169  * return generator;
170  * }
171  *
172  *
173  * /// virtual copy constructor
174  * virtual DuplicateGenerator<ALLOC>* clone () const {
175  * return clone ( this->getAllocator () );
176  * }
177  *
178  * /// destructor
179  * ~DuplicateGenerator() {}
180  *
181  *
182  * // ######################################################################
183  * // Operators
184  * // ######################################################################
185  *
186  * /// copy operator
187  * DuplicateGenerator<ALLOC>&
188  * operator=( const DuplicateGenerator<ALLOC>& from ) {
189  * DBRowGenerator<ALLOC>::operator=( from );
190  * input_row__ = from.input_row__;
191  * nb_duplicates__ = from.nb_duplicates__;
192  * return *this;
193  * }
194  *
195  * /// move operator
196  * DuplicateGenerator<ALLOC>& operator=( DuplicateGenerator<ALLOC>&& from ) {
197  * DBRowGenerator<ALLOC>::operator=( std::move( from ) );
198  * input_row__ = from.input_row__;
199  * nb_duplicates__ = from.nb_duplicates__;
200  * return *this;
201  * }
202  *
203  *
204  * // ######################################################################
205  * // Accessors / Modifiers
206  * // ######################################################################
207  *
208  * /// generates new lines from those the generator gets in input
209  * virtual const DBRow<DBTranslatedValue,ALLOC>& generate() final {
210  * this->decreaseRemainingRows();
211  * return *input_row__;
212  * }
213  *
214  *
215  * protected:
216  *
217  * /// computes the rows it will provide in output
218  * virtual std::size_t
219  * computeRows_( const DBRow<DBTranslatedValue,ALLOC>& row ) final {
220  * input_row__ = &row;
221  * return nb_duplicates__;
222  * }
223  *
224  *
225  * private:
226  * /// the row used as input to generate the output DBRows
227  * const DBRow<DBTranslatedValue,ALLOC>* input_row__ { nullptr };
228  *
229  * /// the number of times we return each input row
230  * std::size_t nb_duplicates__ { std::size_t(1) };
231  * };
232  * @endcode
233  */
234  template < template < typename > class ALLOC = std::allocator >
236  public:
237  /// type for the allocators passed in arguments of methods
239 
240  // ##########################################################################
241  /// @name Constructors / Destructors
242  // ##########################################################################
243 
244  /// @{
245 
246  /// default constructor
247  /** @param column_types indicates for each column whether this is a
248  * continuous or a discrete one
249  * @param alloc the allocator used by all the methods */
252  column_types,
253  const DBRowGeneratorGoal goal,
254  const allocator_type& alloc = allocator_type());
255 
256  /// copy constructor
257  DBRowGenerator(const DBRowGenerator< ALLOC >& from);
258 
259  /// copy constructor with a given allocator
260  DBRowGenerator(const DBRowGenerator< ALLOC >& from,
261  const allocator_type& alloc);
262 
263  /// move constructor
264  DBRowGenerator(DBRowGenerator< ALLOC >&& from);
265 
266  /// move constructor with a given allocator
267  DBRowGenerator(DBRowGenerator< ALLOC >&& from, const allocator_type& alloc);
268 
269  /// virtual copy constructor
270  virtual DBRowGenerator< ALLOC >* clone() const = 0;
271 
272  /// virtual copy constructor with a given allocator
273  virtual DBRowGenerator< ALLOC >*
274  clone(const allocator_type& alloc) const = 0;
275 
276  /// destructor
277  virtual ~DBRowGenerator();
278 
279  /// @}
280 
281 
282  // ##########################################################################
283  /// @name Accessors / Modifiers
284  // ##########################################################################
285 
286  /// @{
287 
288  /** @brief returns true if there are still rows that can be output by
289  * the DBRowGenerator */
290  bool hasRows();
291 
292  /// sets the input row from which the generator will create its output rows
293  /** @return a Boolean indicating whether, from this input DBRow, the
294  * DBRowGenerator is capable of outputting at least one row or not */
295  bool setInputRow(const DBRow< DBTranslatedValue, ALLOC >& row);
296 
297  /// generate new rows from the input row
298  virtual const DBRow< DBTranslatedValue, ALLOC >& generate() = 0;
299 
300  /// decrease the number of remaining output rows
301  /** When method setInputRow is performed, the DBRowGenerator knows how
302  * many output rows it will be able to generate. Each time method
303  * decreaseRemainingRows is called, we decrement this number. When the
304  * number becomes equal to 0, then there remains no new output row to
305  * generate. */
306  void decreaseRemainingRows();
307 
308  /// resets the generator. There are therefore no more output rows to generate
309  virtual void reset();
310 
311  /** @brief sets the columns of interest: the output DBRow needs only
312  * contain correct values for these columns
313  *
314  * This method is useful, e.g., for EM-like algorithms that need to know
315  * which unobserved variables/values need be filled. In this case, the
316  * DBRowGenerator still outputs DBRows with the same columns as the
317  * DatabaseTable, but only the columns of these DBRows corresponding to
318  * those passed in argument to Method setColumnsOfInterest are meaningful.
319  * For instance, if a DatabaseTable contains 10 columns and Method
320  * setColumnsOfInterest() is applied with vector<> { 0, 3, 4 }, then the
321  * DBRowGenerator will output DBRows with 10 columns, in which only
322  * columns 0, 3 and 4 are guaranteed to have correct values (columns are
323  * always indexed, starting from 0).
324  */
325  virtual void setColumnsOfInterest(
326  const std::vector< std::size_t, ALLOC< std::size_t > >& cols_of_interest);
327 
328  /** @brief sets the columns of interest: the output DBRow needs only
329  * contain correct values for these columns
330  *
331  * This method is useful, e.g., for EM-like algorithms that need to know
332  * which unobserved variables/values need be filled. In this case, the
333  * DBRowGenerator still outputs DBRows with the same columns as the
334  * DatabaseTable, but only the columns of these DBRows corresponding to
335  * those passed in argument to Method setColumnsOfInterest are meaningful.
336  * For instance, if a DatabaseTable contains 10 columns and Method
337  * setColumnsOfInterest() is applied with vector<> { 0, 3, 4 }, then the
338  * DBRowGenerator will output DBRows with 10 columns, in which only
339  * columns 0, 3 and 4 are guaranteed to have correct values (columns are
340  * always indexed, starting from 0).
341  */
342  virtual void setColumnsOfInterest(
343  std::vector< std::size_t, ALLOC< std::size_t > >&& cols_of_interest);
344 
345  /// returns the current set of columns of interest
346  const std::vector< std::size_t, ALLOC< std::size_t > >&
347  columnsOfInterest() const;
348 
349  /// returns the allocator used
351 
352  /// returns the goal of the DBRowGenerator
353  DBRowGeneratorGoal goal() const;
354 
355  /// @}
356 
357 
358  protected:
359  /// the number of output rows still to retrieve through the generate method
360  std::size_t nb_remaining_output_rows_{std::size_t(0)};
361 
362  /// the types of the columns in the DatabaseTable
363  /** This is useful to determine whether we need to use the .discr_val
364  * field or the .cont_val field in DBTranslatedValue instances. */
367 
368  /// the set of columns of interest
369  std::vector< std::size_t, ALLOC< std::size_t > > columns_of_interest_;
370 
371  /// the goal of the DBRowGenerator (just remove missing values or not)
374 
375 
376  /// copy operator
378 
379  /// move operator
381 
382  /** @brief the method that computes the set of DBRow instances to output
383  * after method setInputRow has been called */
384  virtual std::size_t
385  computeRows_(const DBRow< DBTranslatedValue, ALLOC >& row)
386  = 0;
387  };
388 
389  } /* namespace learning */
390 
391 } /* namespace gum */
392 
393 
394 // always include the template implementation
395 #include <agrum/tools/database/DBRowGenerator_tpl.h>
396 
397 #endif /* GUM_LEARNING_DBROW_GENERATOR_H */
bool setInputRow(const DBRow< DBTranslatedValue, ALLOC > &row)
sets the input row from which the generator will create its output rows
DBRowGeneratorGoal
the type of things that a DBRowGenerator is designed for
DBRowGenerator(const DBRowGenerator< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
DBRowGenerator(const DBRowGenerator< ALLOC > &from)
copy constructor
DBRowGenerator< ALLOC > & operator=(const DBRowGenerator< ALLOC > &)
copy operator
void decreaseRemainingRows()
decrease the number of remaining output rows
virtual ~DBRowGenerator()
destructor
virtual DBRowGenerator< ALLOC > * clone() const =0
virtual copy constructor
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
virtual void setColumnsOfInterest(const std::vector< std::size_t, ALLOC< std::size_t > > &cols_of_interest)
sets the columns of interest: the output DBRow needs only contain correct values for these columns ...
DBRowGenerator(DBRowGenerator< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual DBRowGenerator< ALLOC > * clone(const allocator_type &alloc) const =0
virtual copy constructor with a given allocator
virtual void setColumnsOfInterest(std::vector< std::size_t, ALLOC< std::size_t > > &&cols_of_interest)
sets the columns of interest: the output DBRow needs only contain correct values for these columns ...
virtual void reset()
resets the generator. There are therefore no more output rows to generate
DBRowGeneratorGoal goal() const
returns the goal of the DBRowGenerator
DBRowGenerator< ALLOC > & operator=(DBRowGenerator< ALLOC > &&)
move operator
DBRowGenerator(DBRowGenerator< ALLOC > &&from)
move constructor
bool hasRows()
returns true if there are still rows that can be output by the DBRowGenerator
DBRowGeneratorGoal goal_
the goal of the DBRowGenerator (just remove missing values or not)
allocator_type getAllocator() const
returns the allocator used
std::vector< DBTranslatedValueType, ALLOC< DBTranslatedValueType > > column_types_
the types of the columns in the DatabaseTable
DBRowGenerator(const std::vector< DBTranslatedValueType, ALLOC< DBTranslatedValueType > > column_types, const DBRowGeneratorGoal goal, const allocator_type &alloc=allocator_type())
default constructor
const std::vector< std::size_t, ALLOC< std::size_t > > & columnsOfInterest() const
returns the current set of columns of interest
std::vector< std::size_t, ALLOC< std::size_t > > columns_of_interest_
the set of columns of interest
virtual std::size_t computeRows_(const DBRow< DBTranslatedValue, ALLOC > &row)=0
the method that computes the set of DBRow instances to output after method setInputRow has been calle...
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
std::size_t nb_remaining_output_rows_
the number of output rows still to retrieve through the generate method
virtual const DBRow< DBTranslatedValue, ALLOC > & generate()=0
generate new rows from the input row