aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
indepTestChi2_tpl.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief the class for computing Chi2 scores
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 
30 # include <agrum/tools/stattests/idCondSet.h>
31 
32 namespace gum {
33 
34  namespace learning {
35 
36  /// default constructor
37  template < template < typename > class ALLOC >
38  INLINE IndepTestChi2< ALLOC >::IndepTestChi2(
40  const Apriori< ALLOC >& apriori,
41  const std::vector< std::pair< std::size_t, std::size_t >,
42  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
44  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
48  }
49 
50 
51  /// default constructor
52  template < template < typename > class ALLOC >
55  const Apriori< ALLOC >& apriori,
57  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
61  }
62 
63 
64  /// copy constructor with a given allocator
65  template < template < typename > class ALLOC >
67  const IndepTestChi2< ALLOC >& from,
68  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
72  }
73 
74 
75  /// copy constructor
76  template < template < typename > class ALLOC >
79 
80 
81  /// move constructor with a given allocator
82  template < template < typename > class ALLOC >
85  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
89  }
90 
91 
92  /// move constructor
93  template < template < typename > class ALLOC >
96 
97 
98  /// virtual copy constructor with a given allocator
99  template < template < typename > class ALLOC >
101  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) const {
104  try {
106  } catch (...) {
108  throw;
109  }
110 
111  return new_score;
112  }
113 
114 
115  /// virtual copy constructor
116  template < template < typename > class ALLOC >
117  IndepTestChi2< ALLOC >* IndepTestChi2< ALLOC >::clone() const {
118  return clone(this->getAllocator());
119  }
120 
121 
122  /// destructor
123  template < template < typename > class ALLOC >
126  }
127 
128 
129  /// copy operator
130  template < template < typename > class ALLOC >
132  if (this != &from) {
134  // __chi2 = from. _chi2_;
135  }
136  return *this;
137  }
138 
139 
140  /// move operator
141  template < template < typename > class ALLOC >
143  if (this != &from) {
145  // __chi2 = std::move(from. _chi2_);
146  }
147  return *this;
148  }
149 
150  /// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
151  template < template < typename > class ALLOC >
152  std::pair< double, double >
154  NodeId var2,
155  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids) {
156  return statistics_(IdCondSet< ALLOC >(var1, var2, rhs_ids, false));
157  }
158 
159  /// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
160  template < template < typename > class ALLOC >
161  std::pair< double, double >
163  // get the countings
164  std::vector< double, ALLOC< double > > N_xyz(this->counter_.counts(idset, true));
167  const std::size_t all_size = (N_xyz.size());
168 
169  // compute the domain sizes of X and Y
170  const auto& nodeId2cols = this->counter_.nodeId2Columns();
171  const auto& database = this->counter_.database();
172  Idx var_x, var_y;
173  if (nodeId2cols.empty()) {
174  var_x = idset[0];
175  var_y = idset[1];
176  } else {
179  }
180 
183 
184  double cumulStat = 0;
185 
186  // here, we distinguish idsets with conditioning nodes from those
187  // without conditioning nodes
188  if (idset.hasConditioningSet()) {
189  const std::size_t Z_size = all_size / (X_size * Y_size);
190 
191  // get the counts for the conditioning nodes
192  std::vector< double, ALLOC< double > > N_xz
193  = this->marginalize_(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
194  std::vector< double, ALLOC< double > > N_yz
195  = this->marginalize_(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
196  std::vector< double, ALLOC< double > > N_z
197  = this->marginalize_(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
198 
199  // indicate to the chi2 distribution the set of conditioning nodes
200  std::vector< Idx > cond_nodes;
202  {
203  const auto cond_idset = idset.conditionalIdCondSet().ids();
204  if (nodeId2cols.empty()) {
205  for (const auto node: cond_idset)
207  } else {
208  for (const auto node: cond_idset)
210  }
211  }
213 
214 
215  // now, perform sum_X sum_Y sum_Z ( #ZYX - #ZX * #ZY / #Z )^2 /
216  // (#ZX * #ZY / #Z )
217  for (std::size_t z = std::size_t(0),
218  beg_xz = std::size_t(0),
219  beg_yz = std::size_t(0),
220  xyz = std::size_t(0);
221  z < Z_size;
222  ++z, beg_xz += X_size, beg_yz += Y_size) {
223  if (N_z[z] > 0) {
224  for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size; ++yz, ++y) {
225  for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size; ++xz, ++x, ++xyz) {
226  const double tmp1 = (N_yz[yz] * N_xz[xz]) / N_z[z];
227  if (tmp1 != 0.0) {
228  const double tmp2 = N_xyz[xyz] - tmp1;
229  cumulStat += (tmp2 * tmp2) / tmp1;
230  }
231  }
232  }
233  } else { // moving xyz out of the loops x,y when if N_z[z]==0
234  xyz += X_size * Y_size;
235  }
236  }
237  } else {
238  // here, there is no conditioning set
239 
240  // indicate to the chi2 distribution the set of conditioning nodes
242 
243  // now, perform sum_X sum_Y ( #XY - (#X * #Y) / N )^2 / (#X * #Y )/N
244 
245  // get the counts for all the targets and for the conditioning nodes
246  std::vector< double, ALLOC< double > > N_x
247  = this->marginalize_(std::size_t(1), X_size, Y_size, std::size_t(1), N_xyz);
248  std::vector< double, ALLOC< double > > N_y
249  = this->marginalize_(std::size_t(0), X_size, Y_size, std::size_t(1), N_xyz);
250 
251  // count N
252  double N = 0.0;
253  for (const auto n_x: N_x)
254  N += n_x;
255 
256  for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
257  const double tmp_Ny = N_y[y];
258  for (std::size_t x = 0; x < X_size; ++x, ++xy) {
259  const double tmp1 = (tmp_Ny * N_x[x]) / N;
260  if (tmp1 != 0.0) {
261  const double tmp2 = N_xyz[xy] - tmp1;
262  cumulStat += (tmp2 * tmp2) / tmp1;
263  }
264  }
265  }
266  }
267 
269  double pValue = _chi2_.probaChi2(cumulStat, df);
270  return std::pair< double, double >(cumulStat, pValue);
271  }
272 
273 
274  /// returns the score corresponding to a given nodeset
275  template < template < typename > class ALLOC >
276  inline double IndepTestChi2< ALLOC >::score_(const IdCondSet< ALLOC >& idset) {
277  const auto& nodeId2cols = this->counter_.nodeId2Columns();
278  Idx var_x, var_y;
279  if (nodeId2cols.empty()) {
280  var_x = idset[0];
281  var_y = idset[1];
282  } else {
285  }
286 
287  auto stat = statistics_(idset); // stat contains pair(Chi2stat,pValue)
288  double score = stat.first;
289 
290  // ok, here, score contains the value of the chi2 formula.
291  // To get a meaningful score, we shall compute the critical values
292  // for the Chi2 distribution and assign as the score of
293  // (score - alpha ) / alpha, where alpha is the critical value
294  const double alpha = _chi2_.criticalValue(var_x, var_y);
295  score = (score - alpha) / alpha;
296 
297  return score;
298  }
299 
300  } /* namespace learning */
301 
302 } /* namespace gum */
303 
304 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)