aGrUM  0.14.1
recordCounter.h
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2005 by Christophe GONZALES and Pierre-Henri WUILLEMIN *
3  * {prenom.nom}_at_lip6.fr *
4  * *
5  * This program is free software; you can redistribute it and/or modify *
6  * it under the terms of the GNU General Public License as published by *
7  * the Free Software Foundation; either version 2 of the License, or *
8  * (at your option) any later version. *
9  * *
10  * This program is distributed in the hope that it will be useful, *
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13  * GNU General Public License for more details. *
14  * *
15  * You should have received a copy of the GNU General Public License *
16  * along with this program; if not, write to the *
17  * Free Software Foundation, Inc., *
18  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19  ***************************************************************************/
28 #ifndef GUM_LEARNING_RECORD_COUNTER_H
29 #define GUM_LEARNING_RECORD_COUNTER_H
30 
31 #include <vector>
32 #include <utility>
33 #include <sstream>
34 #include <string>
35 
36 #include <agrum/agrum.h>
37 #include <agrum/core/bijection.h>
38 #include <agrum/core/sequence.h>
39 #include <agrum/core/OMPThreads.h>
40 #include <agrum/core/threadData.h>
41 #include <agrum/graphs/DAG.h>
44 
45 
46 namespace gum {
47 
48  namespace learning {
49 
109  template < template < typename > class ALLOC = std::allocator >
111  public:
113  using allocator_type = ALLOC< NodeId >;
114 
115  // ##########################################################################
117  // ##########################################################################
119 
121 
141  const DBRowGeneratorParser< ALLOC >& parser,
142  const std::vector< std::pair< std::size_t, std::size_t >,
143  ALLOC< std::pair< std::size_t, std::size_t > > >&
144  ranges,
145  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
146  nodeId2columns =
147  Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
148  const allocator_type& alloc = allocator_type());
149 
151 
165  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
166  nodeId2columns =
167  Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
168  const allocator_type& alloc = allocator_type());
169 
172 
175  const allocator_type& alloc);
176 
179 
182 
184  virtual RecordCounter< ALLOC >* clone() const;
185 
187  virtual RecordCounter< ALLOC >* clone(const allocator_type& alloc) const;
188 
190  virtual ~RecordCounter();
191 
193 
194 
195  // ##########################################################################
197  // ##########################################################################
198 
200 
203 
206 
208 
209 
210  // ##########################################################################
212  // ##########################################################################
213 
215 
217  void clear();
218 
220  void setMaxNbThreads(const std::size_t nb) const;
221 
223  std::size_t nbThreads() const;
224 
234  void setMinNbRowsPerThread(const std::size_t nb) const;
235 
237  std::size_t minNbRowsPerThread() const;
238 
240 
277  const std::vector< double, ALLOC< double > >&
278  counts(const IdSet< ALLOC >& ids, const bool check_discrete_vars = false);
279 
281 
287  template < template < typename > class XALLOC >
288  void setRanges(
289  const std::vector< std::pair< std::size_t, std::size_t >,
290  XALLOC< std::pair< std::size_t, std::size_t > > >&
291  new_ranges);
292 
294  void clearRanges();
295 
297  const std::vector< std::pair< std::size_t, std::size_t >,
298  ALLOC< std::pair< std::size_t, std::size_t > > >&
299  ranges() const;
300 
302 
305  template < typename GUM_SCALAR >
306  void setBayesNet(const BayesNet< GUM_SCALAR >& new_bn);
307 
310 
312 
316  nodeId2Columns() const;
317 
319  const DatabaseTable< ALLOC >& database() const;
320 
322 
323 
324 #ifndef DOXYGEN_SHOULD_SKIP_THIS
325 
326  private:
327  // the parsers used by the threads
328  std::vector< ThreadData< DBRowGeneratorParser< ALLOC > >,
329  ALLOC< ThreadData< DBRowGeneratorParser< ALLOC > > > >
330  __parsers;
331 
332  // the set of ranges of the database's rows indices over which the user
333  // wishes to perform the countings
334  std::vector< std::pair< std::size_t, std::size_t >,
335  ALLOC< std::pair< std::size_t, std::size_t > > >
336  __ranges;
337 
338  // the ranges actually used by the threads: there is a hopefully clever
339  // algorithm that splits the rows ranges into another set of ranges that
340  // are assigned to the threads. For instance, if the database has 1000
341  // rows and there are 10 threads, each one will be assigned a set of 100
342  // rows. These sets are precisely what are stored in the field below
343  mutable std::vector< std::pair< std::size_t, std::size_t >,
344  ALLOC< std::pair< std::size_t, std::size_t > > >
345  __thread_ranges;
346 
347  // the mapping from the NodeIds of the variables to the indices of the
348  // columns in the database
350 
351  // the last database-parsed countings
352  std::vector< double, ALLOC< double > > __last_DB_countings;
353 
354  // the ids of the nodes for the last database-parsed countings
355  IdSet< ALLOC > __last_DB_ids;
356 
357  // the last countings deduced from __last_DB_countings
358  std::vector< double, ALLOC< double > > __last_nonDB_countings;
359 
360  // the ids of the nodes of last countings deduced from __last_DB_countings
361  IdSet< ALLOC > __last_nonDB_ids;
362 
363  // the maximal number of threads that the record counter can use
364  mutable std::size_t __max_nb_threads{
365  std::size_t(gum::getMaxNumberOfThreads())};
366 
367  // the min number of rows that a thread should process in a
368  // multithreading context
369  mutable std::size_t __min_nb_rows_per_thread{100};
370 
371  // returns a mapping from the nodes ids to the columns of the database
372  // for a given sequence of ids. This is especially convenient when
373  // __nodeId2columns is empty (which means that there is an identity mapping)
375  __getNodeIds2Columns(const IdSet< ALLOC >& ids) const;
376 
378  std::vector< double, ALLOC< double > >& __extractFromCountings(
379  const IdSet< ALLOC >& subset_ids,
380  const IdSet< ALLOC >& superset_ids,
381  const std::vector< double, ALLOC< double > >& superset_vect);
382 
384  std::vector< double, ALLOC< double > >&
385  __countFromDatabase(const IdSet< ALLOC >& ids);
386 
388  void __threadedCount(
389  const std::size_t range_begin,
390  const std::size_t range_end,
392  const std::vector< std::pair< std::size_t, std::size_t >,
393  ALLOC< std::pair< std::size_t, std::size_t > > >&
394  cols_and_offsets,
395  std::vector< double, ALLOC< double > >& countings);
396 
398 
401  template < template < typename > class XALLOC >
402  void __checkRanges(
403  const std::vector< std::pair< std::size_t, std::size_t >,
404  XALLOC< std::pair< std::size_t, std::size_t > > >&
405  new_ranges) const;
406 
408 
410  void __checkDiscreteVariables(const IdSet< ALLOC >& ids) const;
411 
413 
416  void __raiseCheckException(
417  const std::vector< std::string, ALLOC< std::string > >& bad_vars) const;
418 
420  void __dispatchRangesToThreads();
421 
422 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
423  };
424 
425  } /* namespace learning */
426 
427 } /* namespace gum */
428 
431 
432 #endif /* GUM_LEARNING_RECORD_COUNTER_H */
Class representing a Bayesian Network.
Definition: BayesNet.h:76
A wrapper that stores data in a way that prevents false cacheline sharing.
virtual RecordCounter< ALLOC > * clone() const
virtual copy constructor
Header file of gum::Sequence, a class for storing (ordered) sequences of objects. ...
const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > & nodeId2Columns() const
returns the mapping from ids to column positions in the database
void clearRanges()
reset the ranges to the one range corresponding to the whole database
std::size_t nbThreads() const
returns the number of threads used to parse the database
A class for storing a pair of sets of NodeIds, the second one corresponding to a conditional set...
Definition: idSet.h:45
allocator_type getAllocator() const
returns the allocator used
unsigned int getMaxNumberOfThreads()
Returns the maximum number of threads at any time.
A class used by learning caches to represent uniquely sets of variables.
gum is the global namespace for all aGrUM entities
Definition: agrum.h:25
void setMaxNbThreads(const std::size_t nb) const
changes the max number of threads used to parse the database
The class for generic Hash Tables.
Definition: hashTable.h:676
const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > & ranges() const
returns the current ranges
void clear()
clears all the last database-parsed countings from memory
std::size_t minNbRowsPerThread() const
returns the minimum number of rows that each thread should process
The class that computes countings of observations from the database.
const std::vector< double, ALLOC< double > > & counts(const IdSet< ALLOC > &ids, const bool check_discrete_vars=false)
returns the counts over all the variables in an IdSet
ALLOC< NodeId > allocator_type
type for the allocators passed in arguments of methods
The class that computes countings of observations from the database.
void setMinNbRowsPerThread(const std::size_t nb) const
changes the minimum number of rows a thread should process in a multithreading context ...
Set of pairs of elements with fast search for both elements.
Definition: bijection.h:1803
The class representing a tabular database as used by learning tasks.
virtual ~RecordCounter()
destructor
Wrappers for OpenMP.
void setRanges(const std::vector< std::pair< std::size_t, std::size_t >, XALLOC< std::pair< std::size_t, std::size_t > > > &new_ranges)
sets new ranges to perform the countings
RecordCounter< ALLOC > & operator=(const RecordCounter< ALLOC > &from)
copy operator
The class for parsing DatabaseTable rows and generating output rows.
RecordCounter(const DBRowGeneratorParser< ALLOC > &parser, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
the class used to read a row in the database and to transform it into a set of DBRow instances that c...
Size NodeId
Type for node ids.
Definition: graphElements.h:97
Set of pairs of elements with fast search for both elements.
Base classes for directed acyclic graphs.
void setBayesNet(const BayesNet< GUM_SCALAR > &new_bn)
assign a new Bayes net to all the counter's generators depending on a BN
const DatabaseTable< ALLOC > & database() const
returns the database on which we perform the counts