aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
independenceTest.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief the base class for all the independence tests used for learning
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_INDEPENDENCE_TEST_H
28 #define GUM_LEARNING_INDEPENDENCE_TEST_H
29 
30 #include <utility>
31 
32 #include <agrum/agrum.h>
33 #include <agrum/tools/core/math/math_utils.h>
34 #include <agrum/tools/core/OMPThreads.h>
35 
36 #include <agrum/tools/stattests/recordCounter.h>
37 #include <agrum/BN/learning/scores_and_tests/scoringCache.h>
38 #include <agrum/BN/learning/aprioris/apriori.h>
39 #include <agrum/BN/learning/structureUtils/graphChange.h>
40 
41 namespace gum {
42 
43  namespace learning {
44 
45  /** @class IndependenceTest
46  * @brief The base class for all the independence tests used for learning
47  * @headerfile independenceTest.h <agrum/BN/learning/scores_and_tests/independenceTest.h>
48  * @ingroup learning_scores
49  */
50  template < template < typename > class ALLOC = std::allocator >
52  public:
53  /// type for the allocators passed in arguments of methods
55 
56  // ##########################################################################
57  /// @name Constructors / Destructors
58  // ##########################################################################
59  /// @{
60 
61  /// default constructor
62  /** @param parser the parser used to parse the database
63  * @param external_apriori An apriori that we add to the computation of
64  * the score (this should come from expert knowledge): this consists in
65  * adding numbers to countings in the contingency tables
66  * @param ranges a set of pairs {(X1,Y1),...,(Xn,Yn)} of database's rows
67  * indices. The countings are then performed only on the union of the
68  * rows [Xi,Yi), i in {1,...,n}. This is useful, e.g, when performing
69  * cross validation tasks, in which part of the database should be ignored.
70  * An empty set of ranges is equivalent to an interval [X,Y) ranging over
71  * the whole database.
72  * @param nodeId2Columns a mapping from the ids of the nodes in the
73  * graphical model to the corresponding column in the DatabaseTable
74  * parsed by the parser. This enables estimating from a database in
75  * which variable A corresponds to the 2nd column the parameters of a BN
76  * in which variable A has a NodeId of 5. An empty nodeId2Columns
77  * bijection means that the mapping is an identity, i.e., the value of a
78  * NodeId is equal to the index of the column in the DatabaseTable.
79  * @param alloc the allocator used to allocate the structures within the
80  * IndependenceTest.
81  * @warning If nodeId2columns is not empty, then only the scores over the
82  * ids belonging to this bijection can be computed: applying method
83  * score() over other ids will raise exception NotFound. */
85  const DBRowGeneratorParser< ALLOC >& parser,
86  const Apriori< ALLOC >& external_apriori,
87  const std::vector< std::pair< std::size_t, std::size_t >,
88  ALLOC< std::pair< std::size_t, std::size_t > > >&
89  ranges,
90  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
92  = Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
94 
95 
96  /// default constructor
97  /** @param parser the parser used to parse the database
98  * @param external_apriori An apriori that we add to the computation of
99  * the score (this should come from expert knowledge): this consists in
100  * adding numbers to countings in the contingency tables
101  * @param nodeId2Columns a mapping from the ids of the nodes in the
102  * graphical model to the corresponding column in the DatabaseTable
103  * parsed by the parser. This enables estimating from a database in
104  * which variable A corresponds to the 2nd column the parameters of a BN
105  * in which variable A has a NodeId of 5. An empty nodeId2Columns
106  * bijection means that the mapping is an identity, i.e., the value of a
107  * NodeId is equal to the index of the column in the DatabaseTable.
108  * @param alloc the allocator used to allocate the structures within the
109  * IndependenceTest.
110  * @warning If nodeId2columns is not empty, then only the scores over the
111  * ids belonging to this bijection can be computed: applying method
112  * score() over other ids will raise exception NotFound. */
114  const DBRowGeneratorParser< ALLOC >& parser,
115  const Apriori< ALLOC >& external_apriori,
116  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
118  = Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
119  const allocator_type& alloc = allocator_type());
120 
121  /// virtual copy constructor
122  virtual IndependenceTest< ALLOC >* clone() const = 0;
123 
124  /// virtual copy constructor with a given allocator
125  virtual IndependenceTest< ALLOC >*
126  clone(const allocator_type& alloc) const = 0;
127 
128  /// destructor
129  virtual ~IndependenceTest();
130 
131  /// @}
132 
133 
134  // ##########################################################################
135  /// @name Accessors / Modifiers
136  // ##########################################################################
137  /// @{
138 
139  /// changes the max number of threads used to parse the database
140  virtual void setMaxNbThreads(std::size_t nb) const;
141 
142  /// returns the number of threads used to parse the database
143  virtual std::size_t nbThreads() const;
144 
145  /** @brief changes the number min of rows a thread should process in a
146  * multithreading context
147  *
148  * When computing score, several threads are used by record counters to
149  * perform countings on the rows of the database, the MinNbRowsPerThread
150  * method indicates how many rows each thread should at least process.
151  * This is used to compute the number of threads actually run. This number
152  * is equal to the min between the max number of threads allowed and the
153  * number of records in the database divided by nb. */
154  virtual void setMinNbRowsPerThread(const std::size_t nb) const;
155 
156  /// returns the minimum of rows that each thread should process
157  virtual std::size_t minNbRowsPerThread() const;
158 
159  /// sets new ranges to perform the countings used by the independence test
160  /** @param ranges a set of pairs {(X1,Y1),...,(Xn,Yn)} of database's rows
161  * indices. The countings are then performed only on the union of the
162  * rows [Xi,Yi), i in {1,...,n}. This is useful, e.g, when performing
163  * cross validation tasks, in which part of the database should be ignored.
164  * An empty set of ranges is equivalent to an interval [X,Y) ranging over
165  * the whole database. */
166  template < template < typename > class XALLOC >
167  void setRanges(
168  const std::vector< std::pair< std::size_t, std::size_t >,
169  XALLOC< std::pair< std::size_t, std::size_t > > >&
170  new_ranges);
171 
172  /// reset the ranges to the one range corresponding to the whole database
173  void clearRanges();
174 
175  /// returns the current ranges
176  const std::vector< std::pair< std::size_t, std::size_t >,
177  ALLOC< std::pair< std::size_t, std::size_t > > >&
178  ranges() const;
179 
180 
181  /// returns the score of a pair of nodes
182  double score(const NodeId var1, const NodeId var2);
183 
184  /// returns the score of a pair of nodes given some other nodes
185  /** @param var1 the first variable on the left side of the conditioning bar
186  * @param var2 the second variable on the left side of the conditioning bar
187  * @param rhs_ids the set of variables on the right side of the
188  * conditioning bar */
189  double score(const NodeId var1,
190  const NodeId var2,
191  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids);
192 
193  /// clears all the data structures from memory, including the cache
194  virtual void clear();
195 
196  /// clears the current cache
197  virtual void clearCache();
198 
199  /// turn on/off the use of a cache of the previously computed score
200  virtual void useCache(const bool on_off);
201 
202  /// return the mapping between the columns of the database and the node ids
203  /** @warning An empty nodeId2Columns bijection means that the mapping is
204  * an identity, i.e., the value of a NodeId is equal to the index of the
205  * column in the DatabaseTable. */
206  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
207  nodeId2Columns() const;
208 
209  /// return the database used by the score
210  const DatabaseTable< ALLOC >& database() const;
211 
212  /// returns the allocator used by the score
214 
215  /// @}
216 
217 
218  protected:
219  /// 1 / log(2)
220  const double one_log2_{M_LOG2E};
221 
222  /// the expert knowledge a priori we add to the contingency tables
223  Apriori< ALLOC >* apriori_{nullptr};
224 
225  /// the record counter used for the countings over discrete variables
227 
228  /// the scoring cache
230 
231  /// a Boolean indicating whether we wish to use the cache
232  bool use_cache_{true};
233 
234  /// an empty vector
236 
237 
238  /// copy constructor
239  IndependenceTest(const IndependenceTest< ALLOC >& from);
240 
241  /// copy constructor with a given allocator
242  IndependenceTest(const IndependenceTest< ALLOC >& from,
243  const allocator_type& alloc);
244 
245  /// move constructor
246  IndependenceTest(IndependenceTest< ALLOC >&& from);
247 
248  /// move constructor with a given allocator
249  IndependenceTest(IndependenceTest< ALLOC >&& from,
250  const allocator_type& alloc);
251 
252  /// copy operator
254 
255  /// move operator
257 
258  /// returns the score for a given IdCondSet
259  /** @throws OperationNotAllowed is raised if the score does not support
260  * calling method score such an idset (due to too many/too few variables
261  * in the left hand side or the right hand side of the idset). */
262  virtual double score_(const IdCondSet< ALLOC >& idset) = 0;
263 
264  /// returns a counting vector where variables are marginalized from N_xyz
265  /** @param node_2_marginalize indicates which node(s) shall be marginalized:
266  * - 0 means that X should be marginalized
267  * - 1 means that Y should be marginalized
268  * - 2 means that Z should be marginalized
269  * @param X_size the domain size of variable X
270  * @param Y_size the domain size of variable Y
271  * @param Z_size the domain size of the set of conditioning variables Z
272  * @param N_xyz a counting vector of dimension X * Y * Z (in this order)
273  */
274  std::vector< double, ALLOC< double > >
276  const std::size_t X_size,
277  const std::size_t Y_size,
278  const std::size_t Z_size,
279  const std::vector< double, ALLOC< double > >& N_xyz) const;
280  };
281 
282  } /* namespace learning */
283 
284 } /* namespace gum */
285 
286 
287 #ifndef GUM_NO_EXTERN_TEMPLATE_CLASS
288 extern template class gum::learning::IndependenceTest<>;
289 #endif
290 
291 
292 /// include the template implementation
293 #include <agrum/tools/stattests/independenceTest_tpl.h>
294 
295 #endif /* GUM_LEARNING_INDEPENDENCE_TEST_H */
std::vector< double, ALLOC< double > > marginalize_(const std::size_t node_2_marginalize, const std::size_t X_size, const std::size_t Y_size, const std::size_t Z_size, const std::vector< double, ALLOC< double > > &N_xyz) const
returns a counting vector where variables are marginalized from N_xyz
IndependenceTest(const IndependenceTest< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
IndependenceTest(const IndependenceTest< ALLOC > &from)
copy constructor
virtual std::size_t minNbRowsPerThread() const
returns the minimum of rows that each thread should process
const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > & ranges() const
returns the current ranges
virtual double score_(const IdCondSet< ALLOC > &idset)=0
returns the score for a given IdCondSet
virtual void clearCache()
clears the current cache
IndependenceTest< ALLOC > & operator=(IndependenceTest< ALLOC > &&from)
move operator
ScoringCache< ALLOC > cache_
the scoring cache
const DatabaseTable< ALLOC > & database() const
return the database used by the score
const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > & nodeId2Columns() const
return the mapping between the columns of the database and the node ids
virtual void setMinNbRowsPerThread(const std::size_t nb) const
changes the number min of rows a thread should process in a multithreading context ...
IndependenceTest(IndependenceTest< ALLOC > &&from)
move constructor
IndependenceTest(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
allocator_type getAllocator() const
returns the allocator used by the score
void clearRanges()
reset the ranges to the one range corresponding to the whole database
virtual IndependenceTest< ALLOC > * clone() const =0
virtual copy constructor
bool use_cache_
a Boolean indicating whether we wish to use the cache
virtual void setMaxNbThreads(std::size_t nb) const
changes the max number of threads used to parse the database
IndependenceTest< ALLOC > & operator=(const IndependenceTest< ALLOC > &from)
copy operator
IndependenceTest(IndependenceTest< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
double score(const NodeId var1, const NodeId var2, const std::vector< NodeId, ALLOC< NodeId > > &rhs_ids)
returns the score of a pair of nodes given some other nodes
double score(const NodeId var1, const NodeId var2)
returns the score of a pair of nodes
virtual ~IndependenceTest()
destructor
virtual IndependenceTest< ALLOC > * clone(const allocator_type &alloc) const =0
virtual copy constructor with a given allocator
const double one_log2_
1 / log(2)
RecordCounter< ALLOC > counter_
the record counter used for the countings over discrete variables
virtual void useCache(const bool on_off)
turn on/off the use of a cache of the previously computed score
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
virtual std::size_t nbThreads() const
returns the number of threads used to parse the database
IndependenceTest(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
virtual void clear()
clears all the data structures from memory, including the cache
void setRanges(const std::vector< std::pair< std::size_t, std::size_t >, XALLOC< std::pair< std::size_t, std::size_t > > > &new_ranges)
sets new ranges to perform the countings used by the independence test
Apriori< ALLOC > * apriori_
the expert knowledge a priori we add to the contingency tables