aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
indepTestChi2_tpl.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief the class for computing Chi2 scores
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 
30 # include <agrum/tools/stattests/idCondSet.h>
31 
32 namespace gum {
33 
34  namespace learning {
35 
36  /// default constructor
37  template < template < typename > class ALLOC >
38  INLINE IndepTestChi2< ALLOC >::IndepTestChi2(
40  const Apriori< ALLOC >& apriori,
41  const std::vector< std::pair< std::size_t, std::size_t >,
42  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
43  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
45  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
49  }
50 
51 
52  /// default constructor
53  template < template < typename > class ALLOC >
56  const Apriori< ALLOC >& apriori,
57  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
59  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
63  }
64 
65 
66  /// copy constructor with a given allocator
67  template < template < typename > class ALLOC >
69  const IndepTestChi2< ALLOC >& from,
70  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
74  }
75 
76 
77  /// copy constructor
78  template < template < typename > class ALLOC >
79  INLINE
82 
83 
84  /// move constructor with a given allocator
85  template < template < typename > class ALLOC >
88  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
92  }
93 
94 
95  /// move constructor
96  template < template < typename > class ALLOC >
99 
100 
101  /// virtual copy constructor with a given allocator
102  template < template < typename > class ALLOC >
104  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) const {
107  try {
109  } catch (...) {
111  throw;
112  }
113 
114  return new_score;
115  }
116 
117 
118  /// virtual copy constructor
119  template < template < typename > class ALLOC >
120  IndepTestChi2< ALLOC >* IndepTestChi2< ALLOC >::clone() const {
121  return clone(this->getAllocator());
122  }
123 
124 
125  /// destructor
126  template < template < typename > class ALLOC >
129  }
130 
131 
132  /// copy operator
133  template < template < typename > class ALLOC >
134  IndepTestChi2< ALLOC >&
136  if (this != &from) {
138  //__chi2 = from.chi2__;
139  }
140  return *this;
141  }
142 
143 
144  /// move operator
145  template < template < typename > class ALLOC >
146  IndepTestChi2< ALLOC >&
148  if (this != &from) {
150  //__chi2 = std::move(from.chi2__);
151  }
152  return *this;
153  }
154 
155  /// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
156  template < template < typename > class ALLOC >
157  std::pair< double, double > IndepTestChi2< ALLOC >::statistics(
158  NodeId var1,
159  NodeId var2,
160  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids) {
161  return statistics_(IdCondSet< ALLOC >(var1, var2, rhs_ids, false));
162  }
163 
164  /// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
165  template < template < typename > class ALLOC >
166  std::pair< double, double >
168  // get the countings
169  std::vector< double, ALLOC< double > > N_xyz(
170  this->counter_.counts(idset, true));
174  const std::size_t all_size = (N_xyz.size());
175 
176  // compute the domain sizes of X and Y
177  const auto& nodeId2cols = this->counter_.nodeId2Columns();
178  const auto& database = this->counter_.database();
179  Idx var_x, var_y;
180  if (nodeId2cols.empty()) {
181  var_x = idset[0];
182  var_y = idset[1];
183  } else {
186  }
187 
190 
191  double cumulStat = 0;
192 
193  // here, we distinguish idsets with conditioning nodes from those
194  // without conditioning nodes
195  if (idset.hasConditioningSet()) {
196  const std::size_t Z_size = all_size / (X_size * Y_size);
197 
198  // get the counts for the conditioning nodes
199  std::vector< double, ALLOC< double > > N_xz
200  = this->marginalize_(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
201  std::vector< double, ALLOC< double > > N_yz
202  = this->marginalize_(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
203  std::vector< double, ALLOC< double > > N_z
204  = this->marginalize_(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
205 
206  // indicate to the chi2 distribution the set of conditioning nodes
207  std::vector< Idx > cond_nodes;
209  {
210  const auto cond_idset = idset.conditionalIdCondSet().ids();
211  if (nodeId2cols.empty()) {
212  for (const auto node: cond_idset)
214  } else {
215  for (const auto node: cond_idset)
217  }
218  }
220 
221 
222  // now, perform sum_X sum_Y sum_Z ( #ZYX - #ZX * #ZY / #Z )^2 /
223  // (#ZX * #ZY / #Z )
224  for (std::size_t z = std::size_t(0),
225  beg_xz = std::size_t(0),
226  beg_yz = std::size_t(0),
227  xyz = std::size_t(0);
228  z < Z_size;
229  ++z, beg_xz += X_size, beg_yz += Y_size) {
230  if (N_z[z] > 0) {
231  for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size;
232  ++yz, ++y) {
233  for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size;
234  ++xz, ++x, ++xyz) {
235  const double tmp1 = (N_yz[yz] * N_xz[xz]) / N_z[z];
236  if (tmp1 != 0.0) {
237  const double tmp2 = N_xyz[xyz] - tmp1;
238  cumulStat += (tmp2 * tmp2) / tmp1;
239  }
240  }
241  }
242  } else { // moving xyz out of the loops x,y when if N_z[z]==0
243  xyz += X_size * Y_size;
244  }
245  }
246  } else {
247  // here, there is no conditioning set
248 
249  // indicate to the chi2 distribution the set of conditioning nodes
251 
252  // now, perform sum_X sum_Y ( #XY - (#X * #Y) / N )^2 / (#X * #Y )/N
253 
254  // get the counts for all the targets and for the conditioning nodes
255  std::vector< double, ALLOC< double > > N_x
256  = this->marginalize_(std::size_t(1),
257  X_size,
258  Y_size,
259  std::size_t(1),
260  N_xyz);
261  std::vector< double, ALLOC< double > > N_y
262  = this->marginalize_(std::size_t(0),
263  X_size,
264  Y_size,
265  std::size_t(1),
266  N_xyz);
267 
268  // count N
269  double N = 0.0;
270  for (const auto n_x: N_x)
271  N += n_x;
272 
273  for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
274  const double tmp_Ny = N_y[y];
275  for (std::size_t x = 0; x < X_size; ++x, ++xy) {
276  const double tmp1 = (tmp_Ny * N_x[x]) / N;
277  if (tmp1 != 0.0) {
278  const double tmp2 = N_xyz[xy] - tmp1;
279  cumulStat += (tmp2 * tmp2) / tmp1;
280  }
281  }
282  }
283  }
284 
286  double pValue = chi2__.probaChi2(cumulStat, df);
287  return std::pair< double, double >(cumulStat, pValue);
288  }
289 
290 
291  /// returns the score corresponding to a given nodeset
292  template < template < typename > class ALLOC >
293  inline double IndepTestChi2< ALLOC >::score_(const IdCondSet< ALLOC >& idset) {
294  const auto& nodeId2cols = this->counter_.nodeId2Columns();
295  Idx var_x, var_y;
296  if (nodeId2cols.empty()) {
297  var_x = idset[0];
298  var_y = idset[1];
299  } else {
302  }
303 
304  auto stat = statistics_(idset); // stat contains pair(Chi2stat,pValue)
305  double score = stat.first;
306 
307  // ok, here, score contains the value of the chi2 formula.
308  // To get a meaningful score, we shall compute the critical values
309  // for the Chi2 distribution and assign as the score of
310  // (score - alpha ) / alpha, where alpha is the critical value
311  const double alpha = chi2__.criticalValue(var_x, var_y);
312  score = (score - alpha) / alpha;
313 
314  return score;
315  }
316 
317  } /* namespace learning */
318 
319 } /* namespace gum */
320 
321 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)