aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
indepTestG2_tpl.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief the class for computing G2 scores
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 
30 namespace gum {
31 
32  namespace learning {
33 
34  /// default constructor
35  template < template < typename > class ALLOC >
36  INLINE IndepTestG2< ALLOC >::IndepTestG2(
38  const Apriori< ALLOC >& apriori,
39  const std::vector< std::pair< std::size_t, std::size_t >,
40  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
42  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
46  }
47 
48 
49  /// default constructor
50  template < template < typename > class ALLOC >
53  const Apriori< ALLOC >& apriori,
55  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
59  }
60 
61 
62  /// copy constructor with a given allocator
63  template < template < typename > class ALLOC >
65  const IndepTestG2< ALLOC >& from,
66  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
70  }
71 
72 
73  /// copy constructor
74  template < template < typename > class ALLOC >
77 
78 
79  /// move constructor with a given allocator
80  template < template < typename > class ALLOC >
82  IndepTestG2< ALLOC >&& from,
83  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
87  }
88 
89 
90  /// move constructor
91  template < template < typename > class ALLOC >
94 
95 
96  /// virtual copy constructor with a given allocator
97  template < template < typename > class ALLOC >
99  const typename IndepTestG2< ALLOC >::allocator_type& alloc) const {
102  try {
104  } catch (...) {
106  throw;
107  }
108 
109  return new_score;
110  }
111 
112 
113  /// virtual copy constructor
114  template < template < typename > class ALLOC >
115  IndepTestG2< ALLOC >* IndepTestG2< ALLOC >::clone() const {
116  return clone(this->getAllocator());
117  }
118 
119 
120  /// destructor
121  template < template < typename > class ALLOC >
122  IndepTestG2< ALLOC >::~IndepTestG2() {
124  }
125 
126 
127  /// copy operator
128  template < template < typename > class ALLOC >
130  if (this != &from) {
132  // __chi2 = from. _chi2_;
133  }
134  return *this;
135  }
136 
137 
138  /// move operator
139  template < template < typename > class ALLOC >
141  if (this != &from) {
143  // __chi2 = std::move(from. _chi2_);
144  }
145  return *this;
146  }
147 
148  /// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
149  template < template < typename > class ALLOC >
150  std::pair< double, double >
152  NodeId var2,
153  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids) {
154  return statistics_(IdCondSet< ALLOC >(var1, var2, rhs_ids, false));
155  }
156 
157  /// returns the score corresponding to a given nodeset
158  template < template < typename > class ALLOC >
159  std::pair< double, double > IndepTestG2< ALLOC >::statistics_(const IdCondSet< ALLOC >& idset) {
160  // get the countings
161  std::vector< double, ALLOC< double > > N_xyz(this->counter_.counts(idset, true));
164  const std::size_t all_size = (N_xyz.size());
165 
166  // compute the domain sizes of X and Y
167  const auto& nodeId2cols = this->counter_.nodeId2Columns();
168  const auto& database = this->counter_.database();
169  Idx var_x, var_y;
170  if (nodeId2cols.empty()) {
171  var_x = idset[0];
172  var_y = idset[1];
173  } else {
176  }
177 
180 
181  double cumulStat = 0.0;
182 
183  // here, we distinguish idsets with conditioning nodes from those
184  // without conditioning nodes
185  if (idset.hasConditioningSet()) {
186  const std::size_t Z_size = all_size / (X_size * Y_size);
187 
188  // get the counts for the conditioning nodes
189  std::vector< double, ALLOC< double > > N_xz
190  = this->marginalize_(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
191  std::vector< double, ALLOC< double > > N_yz
192  = this->marginalize_(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
193  std::vector< double, ALLOC< double > > N_z
194  = this->marginalize_(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
195 
196  // indicate to the chi2 distribution the set of conditioning nodes
197  std::vector< Idx > cond_nodes;
199  {
200  const auto cond_idset = idset.conditionalIdCondSet().ids();
201  if (nodeId2cols.empty()) {
202  for (const auto node: cond_idset)
204  } else {
205  for (const auto node: cond_idset)
207  }
208  }
210 
211 
212  // now, perform :
213  // sum_X sum_Y sum_Z #XYZ * log ( ( #XYZ * #Z ) / ( #XZ * #YZ ) )
214  for (std::size_t z = std::size_t(0),
215  beg_xz = std::size_t(0),
216  beg_yz = std::size_t(0),
217  xyz = std::size_t(0);
218  z < Z_size;
219  ++z, beg_xz += X_size, beg_yz += Y_size) {
220  if (N_z[z] > 0) {
221  for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size; ++yz, ++y) {
222  for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size; ++xz, ++x, ++xyz) {
223  const double tmp1 = N_xyz[xyz] * N_z[z];
224  const double tmp2 = N_yz[yz] * N_xz[xz];
225  if ((tmp1 != 0.0) && (tmp2 != 0.0)) {
226  cumulStat += N_xyz[xyz] * std::log(tmp1 / tmp2);
227  }
228  }
229  }
230  } else { // moving xyz out of the loops x,y when if N_z[z]==0
231  xyz += X_size * Y_size;
232  }
233  }
234  } else {
235  // here, there is no conditioning set
236 
237  // indicate to the chi2 distribution the set of conditioning nodes
239 
240  // now, perform sum_X sum_Y #XY * log ( ( #XY * N ) / ( #X * #Y ) )
241 
242  // get the counts for all the targets and for the conditioning nodes
243  std::vector< double, ALLOC< double > > N_x
244  = this->marginalize_(std::size_t(1), X_size, Y_size, std::size_t(1), N_xyz);
245  std::vector< double, ALLOC< double > > N_y
246  = this->marginalize_(std::size_t(0), X_size, Y_size, std::size_t(1), N_xyz);
247 
248  // count N
249  double N = 0.0;
250  for (auto n_x: N_x)
251  N += n_x;
252 
253  for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
254  const double tmp_Ny = N_y[y];
255  for (std::size_t x = 0; x < X_size; ++x, ++xy) {
256  const double tmp = (tmp_Ny * N_x[x]);
257  if ((tmp != 0.0) && (N_xyz[xy] != 0.0)) {
258  cumulStat += N_xyz[xy] * std::log((N_xyz[xy] * N) / tmp);
259  }
260  }
261  }
262  }
263 
264  // used to make the G test formula asymptotically equivalent
265  // to the Pearson's chi-squared test formula
266  cumulStat *= 2;
267 
269  double pValue = _chi2_.probaChi2(cumulStat, df);
270  return std::pair< double, double >(cumulStat, pValue);
271  }
272 
273  /// returns the score corresponding to a given nodeset
274  template < template < typename > class ALLOC >
275  double IndepTestG2< ALLOC >::score_(const IdCondSet< ALLOC >& idset) {
276  // compute the domain sizes of X and Y
277  const auto& nodeId2cols = this->counter_.nodeId2Columns();
278  Idx var_x, var_y;
279  if (nodeId2cols.empty()) {
280  var_x = idset[0];
281  var_y = idset[1];
282  } else {
285  }
286 
287  auto stat = statistics_(idset);
288  double score = stat.first;
289 
290  // ok, here, score contains the value of the chi2 formula.
291  // To get a meaningful score, we shall compute the critical values
292  // for the Chi2 distribution and assign as the score of
293  // (score - alpha ) / alpha, where alpha is the critical value
294  const double alpha = _chi2_.criticalValue(var_x, var_y);
295  score = (score - alpha) / alpha;
296 
297  return score;
298  }
299 
300  } /* namespace learning */
301 
302 } /* namespace gum */
303 
304 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)