aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
score.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief the base class for all the scores used for learning (BIC, BDeu, etc)
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 #ifndef GUM_LEARNING_SCORE_H
28 #define GUM_LEARNING_SCORE_H
29 
30 #include <utility>
31 
32 #include <agrum/agrum.h>
33 #include <agrum/tools/core/math/math_utils.h>
34 #include <agrum/tools/core/OMPThreads.h>
35 
36 #include <agrum/tools/stattests/recordCounter.h>
37 #include <agrum/BN/learning/scores_and_tests/scoringCache.h>
38 #include <agrum/BN/learning/aprioris/apriori.h>
39 #include <agrum/BN/learning/structureUtils/graphChange.h>
40 
41 namespace gum {
42 
43  namespace learning {
44 
45  /** @class Score
46  * @brief The base class for all the scores used for learning (BIC, BDeu, etc)
47  * @headerfile score.h <agrum/BN/learning/scores_and_tests/score.h>
48  * @ingroup learning_scores
49  */
50  template < template < typename > class ALLOC = std::allocator >
51  class Score {
52  public:
53  /// type for the allocators passed in arguments of methods
55 
56  // ##########################################################################
57  /// @name Constructors / Destructors
58  // ##########################################################################
59  /// @{
60 
61  /// default constructor
62  /** @param parser the parser used to parse the database
63  * @param external_apriori An apriori that we add to the computation of
64  * the score (this should come from expert knowledge)
65  * @param ranges a set of pairs {(X1,Y1),...,(Xn,Yn)} of database's rows
66  * indices. The countings are then performed only on the union of the
67  * rows [Xi,Yi), i in {1,...,n}. This is useful, e.g, when performing
68  * cross validation tasks, in which part of the database should be ignored.
69  * An empty set of ranges is equivalent to an interval [X,Y) ranging over
70  * the whole database.
71  * @param nodeId2Columns a mapping from the ids of the nodes in the
72  * graphical model to the corresponding column in the DatabaseTable
73  * parsed by the parser. This enables estimating from a database in
74  * which variable A corresponds to the 2nd column the parameters of a BN
75  * in which variable A has a NodeId of 5. An empty nodeId2Columns
76  * bijection means that the mapping is an identity, i.e., the value of a
77  * NodeId is equal to the index of the column in the DatabaseTable.
78  * @param alloc the allocator used to allocate the structures within the
79  * Score.
80  * @warning If nodeId2columns is not empty, then only the scores over the
81  * ids belonging to this bijection can be computed: applying method
82  * score() over other ids will raise exception NotFound. */
83  Score(const DBRowGeneratorParser< ALLOC >& parser,
84  const Apriori< ALLOC >& external_apriori,
85  const std::vector< std::pair< std::size_t, std::size_t >,
86  ALLOC< std::pair< std::size_t, std::size_t > > >&
87  ranges,
88  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
90  = Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
92 
93 
94  /// default constructor
95  /** @param parser the parser used to parse the database
96  * @param external_apriori An apriori that we add to the computation of
97  * the score (this should come from expert knowledge)
98  * @param nodeId2Columns a mapping from the ids of the nodes in the
99  * graphical model to the corresponding column in the DatabaseTable
100  * parsed by the parser. This enables estimating from a database in
101  * which variable A corresponds to the 2nd column the parameters of a BN
102  * in which variable A has a NodeId of 5. An empty nodeId2Columns
103  * bijection means that the mapping is an identity, i.e., the value of a
104  * NodeId is equal to the index of the column in the DatabaseTable.
105  * @param alloc the allocator used to allocate the structures within the
106  * Score.
107  * @warning If nodeId2columns is not empty, then only the scores over the
108  * ids belonging to this bijection can be computed: applying method
109  * score() over other ids will raise exception NotFound. */
110  Score(const DBRowGeneratorParser< ALLOC >& parser,
111  const Apriori< ALLOC >& external_apriori,
112  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
114  = Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(),
115  const allocator_type& alloc = allocator_type());
116 
117  /// virtual copy constructor
118  virtual Score< ALLOC >* clone() const = 0;
119 
120  /// virtual copy constructor with a given allocator
121  virtual Score< ALLOC >* clone(const allocator_type& alloc) const = 0;
122 
123  /// destructor
124  virtual ~Score();
125 
126  /// @}
127 
128 
129  // ##########################################################################
130  /// @name Accessors / Modifiers
131  // ##########################################################################
132  /// @{
133 
134  /// changes the max number of threads used to parse the database
135  virtual void setMaxNbThreads(std::size_t nb) const;
136 
137  /// returns the number of threads used to parse the database
138  virtual std::size_t nbThreads() const;
139 
140  /** @brief changes the number min of rows a thread should process in a
141  * multithreading context
142  *
143  * When computing score, several threads are used by record counters to
144  * perform countings on the rows of the database, the MinNbRowsPerThread
145  * method indicates how many rows each thread should at least process.
146  * This is used to compute the number of threads actually run. This number
147  * is equal to the min between the max number of threads allowed and the
148  * number of records in the database divided by nb. */
149  virtual void setMinNbRowsPerThread(const std::size_t nb) const;
150 
151  /// returns the minimum of rows that each thread should process
152  virtual std::size_t minNbRowsPerThread() const;
153 
154  /// sets new ranges to perform the countings used by the score
155  /** @param ranges a set of pairs {(X1,Y1),...,(Xn,Yn)} of database's rows
156  * indices. The countings are then performed only on the union of the
157  * rows [Xi,Yi), i in {1,...,n}. This is useful, e.g, when performing
158  * cross validation tasks, in which part of the database should be ignored.
159  * An empty set of ranges is equivalent to an interval [X,Y) ranging over
160  * the whole database. */
161  template < template < typename > class XALLOC >
162  void setRanges(
163  const std::vector< std::pair< std::size_t, std::size_t >,
164  XALLOC< std::pair< std::size_t, std::size_t > > >&
165  new_ranges);
166 
167  /// reset the ranges to the one range corresponding to the whole database
168  void clearRanges();
169 
170  /// returns the current ranges
171  const std::vector< std::pair< std::size_t, std::size_t >,
172  ALLOC< std::pair< std::size_t, std::size_t > > >&
173  ranges() const;
174 
175  /// returns the score of a single node
176  double score(const NodeId var);
177 
178  /// returns the score of a single node given some other nodes
179  /** @param var the variable on the left side of the conditioning bar
180  * @param rhs_ids the set of variables on the right side of the
181  * conditioning bar */
182  double score(const NodeId var,
183  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids);
184 
185  /// clears all the data structures from memory, including the cache
186  void clear();
187 
188  /// clears the current cache
189  void clearCache();
190 
191  /// turn on/off the use of a cache of the previously computed score
192  void useCache(const bool on_off);
193 
194  /// indicates whether the score uses a cache
195  bool isUsingCache() const;
196 
197  /// return the mapping between the columns of the database and the node ids
198  /** @warning An empty nodeId2Columns bijection means that the mapping is
199  * an identity, i.e., the value of a NodeId is equal to the index of the
200  * column in the DatabaseTable. */
201  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
202  nodeId2Columns() const;
203 
204  /// return the database used by the score
205  const DatabaseTable< ALLOC >& database() const;
206 
207  /// indicates whether the apriori is compatible (meaningful) with the score
208  /** The combination of some scores and aprioris can be meaningless. For
209  * instance, adding a Dirichlet apriori to the K2 score is not very
210  * meaningful since K2 corresonds to a BD score with a 1-smoothing
211  * apriori.
212  * aGrUM allows you to perform such combination, but you can check with
213  * method isAprioriCompatible () whether the result the score will give
214  * you is meaningful or not. */
215  virtual std::string isAprioriCompatible() const = 0;
216 
217  /// returns the internal apriori of the score
218  /** Some scores include an apriori. For instance, the K2 score is a BD
219  * score with a Laplace Apriori ( smoothing(1) ). BDeu is a BD score with
220  * a N'/(r_i * q_i) apriori, where N' is an effective sample size and r_i
221  * is the domain size of the target variable and q_i is the domain size of
222  * the Cartesian product of its parents. The goal of the score's internal
223  * apriori classes is to enable to account for these aprioris outside the
224  * score, e.g., when performing parameter estimation. It is important to
225  * note that, to be meaningful, a structure + parameter learning requires
226  * that the same aprioris are taken into account during structure learning
227  * and parameter learning. */
228  virtual const Apriori< ALLOC >& internalApriori() const = 0;
229 
230  /// returns the allocator used by the score
232 
233  /// @}
234 
235 
236  protected:
237  /// 1 / log(2)
238  const double one_log2_{M_LOG2E};
239 
240  /// the expert knowledge a priori we add to the score
241  Apriori< ALLOC >* apriori_{nullptr};
242 
243  /// the record counter used for the countings over discrete variables
245 
246  /// the scoring cache
248 
249  /// a Boolean indicating whether we wish to use the cache
250  bool use_cache_{true};
251 
252  /// an empty vector
254 
255 
256  /// copy constructor
257  Score(const Score< ALLOC >& from);
258 
259  /// copy constructor with a given allocator
260  Score(const Score< ALLOC >& from, const allocator_type& alloc);
261 
262  /// move constructor
263  Score(Score< ALLOC >&& from);
264 
265  /// move constructor with a given allocator
266  Score(Score< ALLOC >&& from, const allocator_type& alloc);
267 
268  /// copy operator
269  Score< ALLOC >& operator=(const Score< ALLOC >& from);
270 
271  /// move operator
272  Score< ALLOC >& operator=(Score< ALLOC >&& from);
273 
274  /// returns the score for a given IdCondSet
275  /** @throws OperationNotAllowed is raised if the score does not support
276  * calling method score such an idset (due to too many/too few variables
277  * in the left hand side or the right hand side of the idset). */
278  virtual double score_(const IdCondSet< ALLOC >& idset) = 0;
279 
280  /// returns a counting vector where variables are marginalized from N_xyz
281  /** @param X_id the id of the variable to marginalize (this is the first
282  * variable in table N_xyz
283  * @param N_xyz a counting vector of dimension X * cond_vars (in this order)
284  */
285  std::vector< double, ALLOC< double > >
286  marginalize_(const NodeId X_id,
287  const std::vector< double, ALLOC< double > >& N_xyz) const;
288  };
289 
290  } /* namespace learning */
291 
292 } /* namespace gum */
293 
294 
295 #ifndef GUM_NO_EXTERN_TEMPLATE_CLASS
296 extern template class gum::learning::Score<>;
297 #endif
298 
299 
300 /// include the template implementation
301 #include <agrum/BN/learning/scores_and_tests/score_tpl.h>
302 
303 #endif /* GUM_LEARNING_SCORE_H */
const DatabaseTable< ALLOC > & database() const
return the database used by the score
double score(const NodeId var)
returns the score of a single node
void setRanges(const std::vector< std::pair< std::size_t, std::size_t >, XALLOC< std::pair< std::size_t, std::size_t > > > &new_ranges)
sets new ranges to perform the countings used by the score
const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > & nodeId2Columns() const
return the mapping between the columns of the database and the node ids
Score(Score< ALLOC > &&from)
move constructor
bool use_cache_
a Boolean indicating whether we wish to use the cache
Definition: score.h:250
Score(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
virtual void setMaxNbThreads(std::size_t nb) const
changes the max number of threads used to parse the database
Score(Score< ALLOC > &&from, const allocator_type &alloc)
move constructor with a given allocator
virtual std::size_t minNbRowsPerThread() const
returns the minimum of rows that each thread should process
Score(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > & ranges() const
returns the current ranges
virtual std::size_t nbThreads() const
returns the number of threads used to parse the database
ScoringCache< ALLOC > cache_
the scoring cache
Definition: score.h:247
std::vector< double, ALLOC< double > > marginalize_(const NodeId X_id, const std::vector< double, ALLOC< double > > &N_xyz) const
returns a counting vector where variables are marginalized from N_xyz
virtual ~Score()
destructor
Score< ALLOC > & operator=(const Score< ALLOC > &from)
copy operator
void clear()
clears all the data structures from memory, including the cache
virtual const Apriori< ALLOC > & internalApriori() const =0
returns the internal apriori of the score
RecordCounter< ALLOC > counter_
the record counter used for the countings over discrete variables
Definition: score.h:244
Apriori< ALLOC > * apriori_
the expert knowledge a priori we add to the score
Definition: score.h:241
bool isUsingCache() const
indicates whether the score uses a cache
double score(const NodeId var, const std::vector< NodeId, ALLOC< NodeId > > &rhs_ids)
returns the score of a single node given some other nodes
Score< ALLOC > & operator=(Score< ALLOC > &&from)
move operator
void clearCache()
clears the current cache
virtual std::string isAprioriCompatible() const =0
indicates whether the apriori is compatible (meaningful) with the score
void useCache(const bool on_off)
turn on/off the use of a cache of the previously computed score
void clearRanges()
reset the ranges to the one range corresponding to the whole database
virtual Score< ALLOC > * clone(const allocator_type &alloc) const =0
virtual copy constructor with a given allocator
allocator_type getAllocator() const
returns the allocator used by the score
const double one_log2_
1 / log(2)
Definition: score.h:238
virtual void setMinNbRowsPerThread(const std::size_t nb) const
changes the number min of rows a thread should process in a multithreading context ...
virtual double score_(const IdCondSet< ALLOC > &idset)=0
returns the score for a given IdCondSet
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
virtual Score< ALLOC > * clone() const =0
virtual copy constructor
Score(const Score< ALLOC > &from, const allocator_type &alloc)
copy constructor with a given allocator
Score(const Score< ALLOC > &from)
copy constructor