aGrUM  0.16.0
indepTestG2_tpl.h
Go to the documentation of this file.
1 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 
31 namespace gum {
32 
33  namespace learning {
34 
36  template < template < typename > class ALLOC >
38  const DBRowGeneratorParser< ALLOC >& parser,
39  const Apriori< ALLOC >& apriori,
40  const std::vector< std::pair< std::size_t, std::size_t >,
41  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
42  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
43  nodeId2columns,
44  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
45  IndependenceTest< ALLOC >(parser, apriori, ranges, nodeId2columns, alloc),
46  __domain_sizes(parser.database().domainSizes()), __chi2(__domain_sizes) {
47  GUM_CONSTRUCTOR(IndepTestG2);
48  }
49 
50 
52  template < template < typename > class ALLOC >
54  const DBRowGeneratorParser< ALLOC >& parser,
55  const Apriori< ALLOC >& apriori,
56  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
57  nodeId2columns,
58  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
59  IndependenceTest< ALLOC >(parser, apriori, nodeId2columns, alloc),
60  __domain_sizes(parser.database().domainSizes()), __chi2(__domain_sizes) {
61  GUM_CONSTRUCTOR(IndepTestG2);
62  }
63 
64 
66  template < template < typename > class ALLOC >
68  const IndepTestG2< ALLOC >& from,
69  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
70  IndependenceTest< ALLOC >(from, alloc),
71  __chi2(__domain_sizes) {
72  GUM_CONS_CPY(IndepTestG2);
73  }
74 
75 
77  template < template < typename > class ALLOC >
78  INLINE IndepTestG2< ALLOC >::IndepTestG2(const IndepTestG2< ALLOC >& from) :
79  IndepTestG2< ALLOC >(from, from.getAllocator()) {}
80 
81 
83  template < template < typename > class ALLOC >
85  IndepTestG2< ALLOC >&& from,
86  const typename IndepTestG2< ALLOC >::allocator_type& alloc) :
87  IndependenceTest< ALLOC >(std::move(from), alloc),
88  __domain_sizes(from.__domain_sizes), __chi2(__domain_sizes) {
89  GUM_CONS_MOV(IndepTestG2);
90  }
91 
92 
94  template < template < typename > class ALLOC >
95  INLINE IndepTestG2< ALLOC >::IndepTestG2(IndepTestG2< ALLOC >&& from) :
96  IndepTestG2< ALLOC >(std::move(from), from.getAllocator()) {}
97 
98 
100  template < template < typename > class ALLOC >
101  IndepTestG2< ALLOC >* IndepTestG2< ALLOC >::clone(
102  const typename IndepTestG2< ALLOC >::allocator_type& alloc) const {
103  ALLOC< IndepTestG2< ALLOC > > allocator(alloc);
104  IndepTestG2< ALLOC >* new_score = allocator.allocate(1);
105  try {
106  allocator.construct(new_score, *this, alloc);
107  } catch (...) {
108  allocator.deallocate(new_score, 1);
109  throw;
110  }
111 
112  return new_score;
113  }
114 
115 
117  template < template < typename > class ALLOC >
118  IndepTestG2< ALLOC >* IndepTestG2< ALLOC >::clone() const {
119  return clone(this->getAllocator());
120  }
121 
122 
124  template < template < typename > class ALLOC >
126  GUM_DESTRUCTOR(IndepTestG2);
127  }
128 
129 
131  template < template < typename > class ALLOC >
132  IndepTestG2< ALLOC >& IndepTestG2< ALLOC >::
133  operator=(const IndepTestG2< ALLOC >& from) {
134  if (this != &from) {
136  //__chi2 = from.__chi2;
137  }
138  return *this;
139  }
140 
141 
143  template < template < typename > class ALLOC >
144  IndepTestG2< ALLOC >& IndepTestG2< ALLOC >::
145  operator=(IndepTestG2< ALLOC >&& from) {
146  if (this != &from) {
147  IndependenceTest< ALLOC >::operator=(std::move(from));
148  //__chi2 = std::move(from.__chi2);
149  }
150  return *this;
151  }
152 
154  template < template < typename > class ALLOC >
155  std::pair< double, double > IndepTestG2< ALLOC >::statistics(
156  NodeId var1,
157  NodeId var2,
158  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids) {
159  return _statistics(IdSet< ALLOC >(var1, var2, rhs_ids, false));
160  }
161 
163  template < template < typename > class ALLOC >
164  std::pair< double, double >
165  IndepTestG2< ALLOC >::_statistics(const IdSet< ALLOC >& idset) {
166  // get the countings
167  std::vector< double, ALLOC< double > > N_xyz(
168  this->_counter.counts(idset, true));
169  const bool informative_external_apriori = this->_apriori->isInformative();
170  if (informative_external_apriori)
171  this->_apriori->addAllApriori(idset, N_xyz);
172  const std::size_t all_size = (N_xyz.size());
173 
174  // compute the domain sizes of X and Y
175  const auto& nodeId2cols = this->_counter.nodeId2Columns();
176  const auto& database = this->_counter.database();
177  Idx var_x, var_y;
178  if (nodeId2cols.empty()) {
179  var_x = idset[0];
180  var_y = idset[1];
181  } else {
182  var_x = nodeId2cols.second(idset[0]);
183  var_y = nodeId2cols.second(idset[1]);
184  }
185 
186  const std::size_t X_size = database.domainSize(var_x);
187  const std::size_t Y_size = database.domainSize(var_y);
188 
189  double cumulStat = 0.0;
190 
191  // here, we distinguish idsets with conditioning nodes from those
192  // without conditioning nodes
193  if (idset.hasConditioningSet()) {
194  const std::size_t Z_size = all_size / (X_size * Y_size);
195 
196  // get the counts for the conditioning nodes
197  std::vector< double, ALLOC< double > > N_xz =
198  this->_marginalize(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
199  std::vector< double, ALLOC< double > > N_yz =
200  this->_marginalize(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
201  std::vector< double, ALLOC< double > > N_z =
202  this->_marginalize(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
203 
204  // indicate to the chi2 distribution the set of conditioning nodes
205  std::vector< Idx > cond_nodes;
206  cond_nodes.reserve(idset.nbRHSIds());
207  {
208  const auto cond_idset = idset.conditionalIdSet().ids();
209  if (nodeId2cols.empty()) {
210  for (const auto node : cond_idset)
211  cond_nodes.push_back(node);
212  } else {
213  for (const auto node : cond_idset)
214  cond_nodes.push_back(nodeId2cols.second(node));
215  }
216  }
217  __chi2.setConditioningNodes(cond_nodes);
218 
219 
220  // now, perform :
221  // sum_X sum_Y sum_Z #XYZ * log ( ( #XYZ * #Z ) / ( #XZ * #YZ ) )
222  for (std::size_t z = std::size_t(0),
223  beg_xz = std::size_t(0),
224  beg_yz = std::size_t(0),
225  xyz = std::size_t(0);
226  z < Z_size;
227  ++z, beg_xz += X_size, beg_yz += Y_size) {
228  if (N_z[z] > 0) {
229  for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size;
230  ++yz, ++y) {
231  for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size;
232  ++xz, ++x, ++xyz) {
233  const double tmp1 = N_xyz[xyz] * N_z[z];
234  const double tmp2 = N_yz[yz] * N_xz[xz];
235  if ((tmp1 != 0.0) && (tmp2 != 0.0)) {
236  cumulStat += N_xyz[xyz] * std::log(tmp1 / tmp2);
237  }
238  }
239  }
240  } else { // moving xyz out of the loops x,y when if N_z[z]==0
241  xyz += X_size * Y_size;
242  }
243  }
244  } else {
245  // here, there is no conditioning set
246 
247  // indicate to the chi2 distribution the set of conditioning nodes
248  __chi2.setConditioningNodes(__empty_set);
249 
250  // now, perform sum_X sum_Y #XY * log ( ( #XY * N ) / ( #X * #Y ) )
251 
252  // get the counts for all the targets and for the conditioning nodes
253  std::vector< double, ALLOC< double > > N_x = this->_marginalize(
254  std::size_t(1), X_size, Y_size, std::size_t(1), N_xyz);
255  std::vector< double, ALLOC< double > > N_y = this->_marginalize(
256  std::size_t(0), X_size, Y_size, std::size_t(1), N_xyz);
257 
258  // count N
259  double N = 0.0;
260  for (auto n_x : N_x)
261  N += n_x;
262 
263  for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
264  const double tmp_Ny = N_y[y];
265  for (std::size_t x = 0; x < X_size; ++x, ++xy) {
266  const double tmp = (tmp_Ny * N_x[x]);
267  if ((tmp != 0.0) && (N_xyz[xy] != 0.0)) {
268  cumulStat += N_xyz[xy] * std::log((N_xyz[xy] * N) / tmp);
269  }
270  }
271  }
272  }
273 
274  // used to make the G test formula asymptotically equivalent
275  // to the Pearson's chi-squared test formula
276  cumulStat *= 2;
277 
278  Size df = __chi2.degreesOfFreedom(var_x, var_y);
279  double pValue = __chi2.probaChi2(cumulStat, df);
280  return std::pair< double, double >(cumulStat, pValue);
281  }
282 
284  template < template < typename > class ALLOC >
285  double IndepTestG2< ALLOC >::_score(const IdSet< ALLOC >& idset) {
286  // compute the domain sizes of X and Y
287  const auto& nodeId2cols = this->_counter.nodeId2Columns();
288  Idx var_x, var_y;
289  if (nodeId2cols.empty()) {
290  var_x = idset[0];
291  var_y = idset[1];
292  } else {
293  var_x = nodeId2cols.second(idset[0]);
294  var_y = nodeId2cols.second(idset[1]);
295  }
296 
297  auto stat = _statistics(idset);
298  double score = stat.first;
299 
300  // ok, here, score contains the value of the chi2 formula.
301  // To get a meaningful score, we shall compute the critical values
302  // for the Chi2 distribution and assign as the score of
303  // (score - alpha ) / alpha, where alpha is the critical value
304  const double alpha = __chi2.criticalValue(var_x, var_y);
305  score = (score - alpha) / alpha;
306 
307  return score;
308  }
309 
310  } /* namespace learning */
311 
312 } /* namespace gum */
313 
314 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
ALLOC< NodeId > allocator_type
type for the allocators passed in arguments of methods
Definition: indepTestG2.h:51
std::vector< double, ALLOC< double > > _marginalize(const std::size_t node_2_marginalize, const std::size_t X_size, const std::size_t Y_size, const std::size_t Z_size, const std::vector< double, ALLOC< double > > &N_xyz) const
returns a counting vector where variables are marginalized from N_xyz
virtual IndepTestG2< ALLOC > * clone() const
virtual copy constructor
RecordCounter< ALLOC > _counter
the record counter used for the countings over discrete variables
IndepTestG2(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
STL namespace.
const DatabaseTable< ALLOC > & database() const
return the database used by the score
Copyright 2005-2019 Pierre-Henri WUILLEMIN et Christophe GONZALES (LIP6) {prenom.nom}_at_lip6.fr.
Definition: agrum.h:25
IndependenceTest(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
allocator_type getAllocator() const
returns the allocator used by the score
std::pair< double, double > statistics(NodeId var1, NodeId var2, const std::vector< NodeId, ALLOC< NodeId > > &rhs_ids={})
get the pair <G2statistic,pvalue> for a test var1 indep var2 given rhs_ids
IndependenceTest< ALLOC > & operator=(const IndependenceTest< ALLOC > &from)
copy operator
double score(const NodeId var1, const NodeId var2)
returns the score of a pair of nodes
virtual ~IndepTestG2()
destructor
IndepTestG2< ALLOC > & operator=(const IndepTestG2< ALLOC > &from)
copy operator
virtual double _score(const IdSet< ALLOC > &idset) final
returns the score for a given IdSet
std::size_t Size
In aGrUM, hashed values are unsigned long int.
Definition: types.h:48
Size NodeId
Type for node ids.
Definition: graphElements.h:98
Apriori< ALLOC > * _apriori
the expert knowledge a priori we add to the contongency tables
std::pair< double, double > _statistics(const IdSet< ALLOC > &idset)
compute the pair <G2 statistic,pvalue>