aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
pseudoCount.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief the class giving access to pseudo-counts (counts in the database + prior)
24  *
25  * @author Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
26  */
27 #ifndef GUM_LEARNING_PSEUDO_COUNT_H
28 #define GUM_LEARNING_PSEUDO_COUNT_H
29 
30 #include <utility>
31 
32 #include <agrum/agrum.h>
33 #include <agrum/tools/core/math/math_utils.h>
34 #include <agrum/tools/core/OMPThreads.h>
35 
36 #include <agrum/tools/stattests/recordCounter.h>
37 #include <agrum/BN/learning/aprioris/apriori.h>
38 #include <agrum/tools/variables/discreteVariable.h>
39 #include <agrum/tools/multidim/potential.h>
40 
41 namespace gum {
42 
43  namespace learning {
44 
45  /** @class PseudoCount
46  * @brief The class giving access to pseudo-counts, i.e., counts in the database +
47  * prior
48  * @headerfile pseudoCount.h <agrum/tools/stattests/pseudoCount.h>
49  * @ingroup learning_pseudo-counts
50  */
51  template < template < typename > class ALLOC = std::allocator >
52  class PseudoCount {
53  public:
54  /// type for the allocators passed in arguments of methods
56 
57  // ##########################################################################
58  /// @name Constructors / Destructors
59  // ##########################################################################
60  /// @{
61 
62  /// default constructor
63  /** @param parser the parser used to parse the database
64  * @param external_apriori An apriori that we add to the computation of
65  * the pseudo-count (this should come from expert knowledge): this consists
66  * in adding numbers to countings in the contingency tables
67  * @param ranges a set of pairs {(X1,Y1),...,(Xn,Yn)} of database's rows
68  * indices. The countings are then performed only on the union of the
69  * rows [Xi,Yi), i in {1,...,n}. This is useful, e.g, when performing
70  * cross validation tasks, in which part of the database should be ignored.
71  * An empty set of ranges is equivalent to an interval [X,Y) ranging over
72  * the whole database.
73  * @param nodeId2columns a mapping from the ids of the nodes in the
74  * graphical model to the corresponding column in the DatabaseTable
75  * parsed by the parser. This enables estimating from a database in
76  * which variable A corresponds to the 2nd column the parameters of a BN
77  * in which variable A has a NodeId of 5. An empty nodeId2columns
78  * bijection means that the mapping is an identity, i.e., the value of a
79  * NodeId is equal to the index of the column in the DatabaseTable.
80  * @param alloc the allocator used to allocate the structures within the
81  * PseudoCount.
82  * @warning If nodeId2columns is not empty, then only the pseudo-counts over
83  * the ids belonging to this bijection can be computed: applying method
84  * get() over other ids will raise exception NotFound. */
86  const DBRowGeneratorParser< ALLOC >& parser,
87  const Apriori< ALLOC >& external_apriori,
88  const std::vector< std::pair< std::size_t, std::size_t >,
89  ALLOC< std::pair< std::size_t, std::size_t > > >&
90  ranges,
91  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
93  = Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
95 
96 
97  /// default constructor
98  /** @param parser the parser used to parse the database
99  * @param external_apriori An apriori that we add to the computation of
100  * the pseudo-count (this should come from expert knowledge): this consists
101  * in adding numbers to countings in the contingency tables
102  * @param nodeId2columns a mapping from the ids of the nodes in the
103  * graphical model to the corresponding column in the DatabaseTable
104  * parsed by the parser. This enables estimating from a database in
105  * which variable A corresponds to the 2nd column the parameters of a BN
106  * in which variable A has a NodeId of 5. An empty nodeId2columns
107  * bijection means that the mapping is an identity, i.e., the value of a
108  * NodeId is equal to the index of the column in the DatabaseTable.
109  * @param alloc the allocator used to allocate the structures within the
110  * PseudoCount.
111  * @warning If nodeId2columns is not empty, then only the pseudo-counts over
112  * the ids belonging to this bijection can be computed: applying method
113  * get() over other ids will raise exception NotFound. */
114  PseudoCount(const DBRowGeneratorParser< ALLOC >& parser,
115  const Apriori< ALLOC >& external_apriori,
116  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
118  = Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
119  const allocator_type& alloc = allocator_type());
120 
121  /// destructor
122  virtual ~PseudoCount();
123 
124 
125  /// copy constructor
126  PseudoCount(const PseudoCount< ALLOC >& from);
127 
128  /// copy constructor with a given allocator
129  PseudoCount(const PseudoCount< ALLOC >& from, const allocator_type& alloc);
130 
131  /// move constructor
132  PseudoCount(PseudoCount< ALLOC >&& from);
133 
134  /// move constructor with a given allocator
135  PseudoCount(PseudoCount< ALLOC >&& from, const allocator_type& alloc);
136 
137  /// copy operator
139 
140  /// move operator
142  /// @}
143 
144 
145  // ##########################################################################
146  /// @name Accessors / Modifiers
147  // ##########################################################################
148  /// @{
149 
150  /// changes the max number of threads used to parse the database
151  virtual void setMaxNbThreads(std::size_t nb) const;
152 
153  /// returns the number of threads used to parse the database
154  virtual std::size_t nbThreads() const;
155 
156  /** @brief changes the minimum number of rows a thread should process in a
157  * multithreading context
158  *
159  * When computing pseudo-counts, several threads are used by record counters
160  * to perform countings on the rows of the database. The setMinNbRowsPerThread
161  * method indicates how many rows each thread should at least process.
162  * This is used to compute the number of threads actually run. This number
163  * is equal to the min between the max number of threads allowed and the
164  * number of records in the database divided by nb. */
165  virtual void setMinNbRowsPerThread(const std::size_t nb) const;
166 
167  /// returns the minimum of rows that each thread should process
168  virtual std::size_t minNbRowsPerThread() const;
169 
170  /// sets new ranges to perform the countings used by the pseudo-count
171  /** @param new_ranges a set of pairs {(X1,Y1),...,(Xn,Yn)} of database's rows
172  * indices. The countings are then performed only on the union of the
173  * rows [Xi,Yi), i in {1,...,n}. This is useful, e.g, when performing
174  * cross validation tasks, in which part of the database should be ignored.
175  * An empty set of ranges is equivalent to an interval [X,Y) ranging over
176  * the whole database. */
177  template < template < typename > class XALLOC >
178  void setRanges(
179  const std::vector< std::pair< std::size_t, std::size_t >,
180  XALLOC< std::pair< std::size_t, std::size_t > > >&
181  new_ranges);
182 
183  /// reset the ranges to the one range corresponding to the whole database
184  void clearRanges();
185 
186  /// returns the current ranges
187  const std::vector< std::pair< std::size_t, std::size_t >,
188  ALLOC< std::pair< std::size_t, std::size_t > > >&
189  ranges() const;
190 
191  /// returns the pseudo-counts over a set of nodes
192  /** @param ids the ids of the nodes over which the pseudo-counts are
193  * computed
194  * @return the pseudo-counts as a vector of doubles (its layout is not
195  * specified in this header) */
196  std::vector< double, ALLOC< double > >
197  get(const std::vector< NodeId, ALLOC< NodeId > >& ids);
198 
199  /// clears all the data structures from memory, including the cache
200  virtual void clear();
201 
202  /// return the mapping between the columns of the database and the node ids
203  /** @warning An empty nodeId2Columns bijection means that the mapping is
204  * an identity, i.e., the value of a NodeId is equal to the index of the
205  * column in the DatabaseTable. */
206  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
207  nodeId2Columns() const;
208 
209  /// return the database used by the pseudo-count
210  const DatabaseTable< ALLOC >& database() const;
211 
212  /// returns the allocator used by the pseudo-count
214 
215  /// @}
216 
217 
218  protected:
219  /// the expert knowledge a priori we add to the contingency tables
220  Apriori< ALLOC >* apriori_{nullptr};
221 
222  /// the record counter used for the countings over discrete variables
224 
225  /// an empty vector
227  }; /* class PseudoCount */
228  } // namespace learning
229 } /* namespace gum */
230 
231 
232 #ifndef GUM_NO_EXTERN_TEMPLATE_CLASS
233 extern template class gum::learning::PseudoCount<>;
234 #endif
235 
236 
237 /// include the template implementation
238 #include <agrum/tools/stattests/pseudoCount_tpl.h>
239 
240 #endif /* GUM_LEARNING_PSEUDO_COUNT_H */
allocator_type getAllocator() const
returns the allocator used by the pseudo-count
void clearRanges()
reset the ranges to the one range corresponding to the whole database
const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > & ranges() const
returns the current ranges
PseudoCount(const PseudoCount< ALLOC > &from)
copy constructor
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
RecordCounter< ALLOC > counter_
the record counter used for the countings over discrete variables
Definition: pseudoCount.h:223
PseudoCount< ALLOC > & operator=(PseudoCount< ALLOC > &&from)
move operator
virtual void setMaxNbThreads(std::size_t nb) const
changes the max number of threads used to parse the database
PseudoCount(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
std::vector< double, ALLOC< double > > get(const std::vector< NodeId, ALLOC< NodeId > > &ids)
returns the pseudo-counts over a set of nodes
virtual void clear()
clears all the data structures from memory, including the cache
PseudoCount(PseudoCount< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
Apriori< ALLOC > * apriori_
the expert knowledge a priori we add to the contingency tables
Definition: pseudoCount.h:220
PseudoCount(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
const DatabaseTable< ALLOC > & database() const
return the database used by the pseudo-count
virtual void setMinNbRowsPerThread(const std::size_t nb) const
changes the minimum number of rows a thread should process in a multithreading context ...
const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > & nodeId2Columns() const
return the mapping between the columns of the database and the node ids
void setRanges(const std::vector< std::pair< std::size_t, std::size_t >, XALLOC< std::pair< std::size_t, std::size_t > > > &new_ranges)
sets new ranges to perform the countings used by the pseudo-count
PseudoCount(const PseudoCount< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
PseudoCount(PseudoCount< ALLOC > &&from)
move constructor
virtual std::size_t minNbRowsPerThread() const
returns the minimum of rows that each thread should process
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
virtual std::size_t nbThreads() const
returns the number of threads used to parse the database
virtual ~PseudoCount()
destructor
PseudoCount< ALLOC > & operator=(const PseudoCount< ALLOC > &from)
copy operator