aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
indepTestG2_tpl.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief the class for computing G2 scores
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 
30 namespace gum {
31 
32  namespace learning {
33 
34  /// default constructor
35  template < template < typename > class ALLOC >
36  INLINE IndepTestG2< ALLOC >::IndepTestG2(
38  const Apriori< ALLOC >& apriori,
39  const std::vector< std::pair< std::size_t, std::size_t >,
40  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
41  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
43  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
47  }
48 
49 
50  /// default constructor
51  template < template < typename > class ALLOC >
54  const Apriori< ALLOC >& apriori,
55  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
57  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
61  }
62 
63 
64  /// copy constructor with a given allocator
65  template < template < typename > class ALLOC >
67  const IndepTestG2< ALLOC >& from,
68  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
72  }
73 
74 
75  /// copy constructor
76  template < template < typename > class ALLOC >
79 
80 
81  /// move constructor with a given allocator
82  template < template < typename > class ALLOC >
84  IndepTestG2< ALLOC >&& from,
85  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
89  }
90 
91 
92  /// move constructor
93  template < template < typename > class ALLOC >
96 
97 
98  /// virtual copy constructor with a given allocator
99  template < template < typename > class ALLOC >
101  const typename IndepTestG2< ALLOC >::allocator_type& alloc) const {
104  try {
106  } catch (...) {
108  throw;
109  }
110 
111  return new_score;
112  }
113 
114 
115  /// virtual copy constructor
116  template < template < typename > class ALLOC >
117  IndepTestG2< ALLOC >* IndepTestG2< ALLOC >::clone() const {
118  return clone(this->getAllocator());
119  }
120 
121 
122  /// destructor
123  template < template < typename > class ALLOC >
124  IndepTestG2< ALLOC >::~IndepTestG2() {
126  }
127 
128 
129  /// copy operator
130  template < template < typename > class ALLOC >
131  IndepTestG2< ALLOC >&
132  IndepTestG2< ALLOC >::operator=(const IndepTestG2< ALLOC >& from) {
133  if (this != &from) {
135  //__chi2 = from.chi2__;
136  }
137  return *this;
138  }
139 
140 
141  /// move operator
142  template < template < typename > class ALLOC >
143  IndepTestG2< ALLOC >&
145  if (this != &from) {
147  //__chi2 = std::move(from.chi2__);
148  }
149  return *this;
150  }
151 
152  /// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
153  template < template < typename > class ALLOC >
154  std::pair< double, double > IndepTestG2< ALLOC >::statistics(
155  NodeId var1,
156  NodeId var2,
157  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids) {
158  return statistics_(IdCondSet< ALLOC >(var1, var2, rhs_ids, false));
159  }
160 
161  /// returns the score corresponding to a given nodeset
162  template < template < typename > class ALLOC >
163  std::pair< double, double >
165  // get the countings
166  std::vector< double, ALLOC< double > > N_xyz(
167  this->counter_.counts(idset, true));
171  const std::size_t all_size = (N_xyz.size());
172 
173  // compute the domain sizes of X and Y
174  const auto& nodeId2cols = this->counter_.nodeId2Columns();
175  const auto& database = this->counter_.database();
176  Idx var_x, var_y;
177  if (nodeId2cols.empty()) {
178  var_x = idset[0];
179  var_y = idset[1];
180  } else {
183  }
184 
187 
188  double cumulStat = 0.0;
189 
190  // here, we distinguish idsets with conditioning nodes from those
191  // without conditioning nodes
192  if (idset.hasConditioningSet()) {
193  const std::size_t Z_size = all_size / (X_size * Y_size);
194 
195  // get the counts for the conditioning nodes
196  std::vector< double, ALLOC< double > > N_xz
197  = this->marginalize_(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
198  std::vector< double, ALLOC< double > > N_yz
199  = this->marginalize_(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
200  std::vector< double, ALLOC< double > > N_z
201  = this->marginalize_(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
202 
203  // indicate to the chi2 distribution the set of conditioning nodes
204  std::vector< Idx > cond_nodes;
206  {
207  const auto cond_idset = idset.conditionalIdCondSet().ids();
208  if (nodeId2cols.empty()) {
209  for (const auto node: cond_idset)
211  } else {
212  for (const auto node: cond_idset)
214  }
215  }
217 
218 
219  // now, perform :
220  // sum_X sum_Y sum_Z #XYZ * log ( ( #XYZ * #Z ) / ( #XZ * #YZ ) )
221  for (std::size_t z = std::size_t(0),
222  beg_xz = std::size_t(0),
223  beg_yz = std::size_t(0),
224  xyz = std::size_t(0);
225  z < Z_size;
226  ++z, beg_xz += X_size, beg_yz += Y_size) {
227  if (N_z[z] > 0) {
228  for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size;
229  ++yz, ++y) {
230  for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size;
231  ++xz, ++x, ++xyz) {
232  const double tmp1 = N_xyz[xyz] * N_z[z];
233  const double tmp2 = N_yz[yz] * N_xz[xz];
234  if ((tmp1 != 0.0) && (tmp2 != 0.0)) {
235  cumulStat += N_xyz[xyz] * std::log(tmp1 / tmp2);
236  }
237  }
238  }
239  } else { // moving xyz out of the loops x,y when if N_z[z]==0
240  xyz += X_size * Y_size;
241  }
242  }
243  } else {
244  // here, there is no conditioning set
245 
246  // indicate to the chi2 distribution the set of conditioning nodes
248 
249  // now, perform sum_X sum_Y #XY * log ( ( #XY * N ) / ( #X * #Y ) )
250 
251  // get the counts for all the targets and for the conditioning nodes
252  std::vector< double, ALLOC< double > > N_x
253  = this->marginalize_(std::size_t(1),
254  X_size,
255  Y_size,
256  std::size_t(1),
257  N_xyz);
258  std::vector< double, ALLOC< double > > N_y
259  = this->marginalize_(std::size_t(0),
260  X_size,
261  Y_size,
262  std::size_t(1),
263  N_xyz);
264 
265  // count N
266  double N = 0.0;
267  for (auto n_x: N_x)
268  N += n_x;
269 
270  for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
271  const double tmp_Ny = N_y[y];
272  for (std::size_t x = 0; x < X_size; ++x, ++xy) {
273  const double tmp = (tmp_Ny * N_x[x]);
274  if ((tmp != 0.0) && (N_xyz[xy] != 0.0)) {
275  cumulStat += N_xyz[xy] * std::log((N_xyz[xy] * N) / tmp);
276  }
277  }
278  }
279  }
280 
281  // used to make the G test formula asymptotically equivalent
282  // to the Pearson's chi-squared test formula
283  cumulStat *= 2;
284 
286  double pValue = chi2__.probaChi2(cumulStat, df);
287  return std::pair< double, double >(cumulStat, pValue);
288  }
289 
290  /// returns the score corresponding to a given nodeset
291  template < template < typename > class ALLOC >
292  double IndepTestG2< ALLOC >::score_(const IdCondSet< ALLOC >& idset) {
293  // compute the domain sizes of X and Y
294  const auto& nodeId2cols = this->counter_.nodeId2Columns();
295  Idx var_x, var_y;
296  if (nodeId2cols.empty()) {
297  var_x = idset[0];
298  var_y = idset[1];
299  } else {
302  }
303 
304  auto stat = statistics_(idset);
305  double score = stat.first;
306 
307  // ok, here, score contains the value of the chi2 formula.
308  // To get a meaningful score, we shall compute the critical values
309  // for the Chi2 distribution and assign as the score of
310  // (score - alpha ) / alpha, where alpha is the critical value
311  const double alpha = chi2__.criticalValue(var_x, var_y);
312  score = (score - alpha) / alpha;
313 
314  return score;
315  }
316 
317  } /* namespace learning */
318 
319 } /* namespace gum */
320 
321 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)