aGrUM  0.16.0
indepTestChi2_tpl.h
Go to the documentation of this file.
1 
29 #ifndef DOXYGEN_SHOULD_SKIP_THIS
30 
32 
33 namespace gum {
34 
35  namespace learning {
36 
38  template < template < typename > class ALLOC >
40  const DBRowGeneratorParser< ALLOC >& parser,
41  const Apriori< ALLOC >& apriori,
42  const std::vector< std::pair< std::size_t, std::size_t >,
43  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
44  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
45  nodeId2columns,
46  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
47  IndependenceTest< ALLOC >(parser, apriori, ranges, nodeId2columns, alloc),
48  __domain_sizes(parser.database().domainSizes()), __chi2(__domain_sizes) {
49  GUM_CONSTRUCTOR(IndepTestChi2);
50  }
51 
52 
54  template < template < typename > class ALLOC >
56  const DBRowGeneratorParser< ALLOC >& parser,
57  const Apriori< ALLOC >& apriori,
58  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
59  nodeId2columns,
60  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
61  IndependenceTest< ALLOC >(parser, apriori, nodeId2columns, alloc),
62  __domain_sizes(parser.database().domainSizes()), __chi2(__domain_sizes) {
63  GUM_CONSTRUCTOR(IndepTestChi2);
64  }
65 
66 
68  template < template < typename > class ALLOC >
70  const IndepTestChi2< ALLOC >& from,
71  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
72  IndependenceTest< ALLOC >(from, alloc),
73  __domain_sizes(from.__domain_sizes), __chi2(__domain_sizes) {
74  GUM_CONS_CPY(IndepTestChi2);
75  }
76 
77 
79  template < template < typename > class ALLOC >
80  INLINE
81  IndepTestChi2< ALLOC >::IndepTestChi2(const IndepTestChi2< ALLOC >& from) :
82  IndepTestChi2< ALLOC >(from, from.getAllocator()) {}
83 
84 
86  template < template < typename > class ALLOC >
88  IndepTestChi2< ALLOC >&& from,
89  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
90  IndependenceTest< ALLOC >(std::move(from), alloc),
91  __domain_sizes(from.__domain_sizes), __chi2(__domain_sizes) {
92  GUM_CONS_MOV(IndepTestChi2);
93  }
94 
95 
97  template < template < typename > class ALLOC >
98  INLINE IndepTestChi2< ALLOC >::IndepTestChi2(IndepTestChi2< ALLOC >&& from) :
99  IndepTestChi2< ALLOC >(std::move(from), from.getAllocator()) {}
100 
101 
103  template < template < typename > class ALLOC >
104  IndepTestChi2< ALLOC >* IndepTestChi2< ALLOC >::clone(
105  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) const {
106  ALLOC< IndepTestChi2< ALLOC > > allocator(alloc);
107  IndepTestChi2< ALLOC >* new_score = allocator.allocate(1);
108  try {
109  allocator.construct(new_score, *this, alloc);
110  } catch (...) {
111  allocator.deallocate(new_score, 1);
112  throw;
113  }
114 
115  return new_score;
116  }
117 
118 
120  template < template < typename > class ALLOC >
121  IndepTestChi2< ALLOC >* IndepTestChi2< ALLOC >::clone() const {
122  return clone(this->getAllocator());
123  }
124 
125 
127  template < template < typename > class ALLOC >
129  GUM_DESTRUCTOR(IndepTestChi2);
130  }
131 
132 
134  template < template < typename > class ALLOC >
135  IndepTestChi2< ALLOC >& IndepTestChi2< ALLOC >::
136  operator=(const IndepTestChi2< ALLOC >& from) {
137  if (this != &from) {
139  //__chi2 = from.__chi2;
140  }
141  return *this;
142  }
143 
144 
146  template < template < typename > class ALLOC >
147  IndepTestChi2< ALLOC >& IndepTestChi2< ALLOC >::
148  operator=(IndepTestChi2< ALLOC >&& from) {
149  if (this != &from) {
150  IndependenceTest< ALLOC >::operator=(std::move(from));
151  //__chi2 = std::move(from.__chi2);
152  }
153  return *this;
154  }
155 
157  template < template < typename > class ALLOC >
158  std::pair< double, double > IndepTestChi2< ALLOC >::statistics(
159  NodeId var1,
160  NodeId var2,
161  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids) {
162  return _statistics(IdSet< ALLOC >(var1, var2, rhs_ids, false));
163  }
164 
166  template < template < typename > class ALLOC >
167  std::pair< double, double >
168  IndepTestChi2< ALLOC >::_statistics(const IdSet< ALLOC >& idset) {
169  // get the countings
170  std::vector< double, ALLOC< double > > N_xyz(
171  this->_counter.counts(idset, true));
172  const bool informative_external_apriori = this->_apriori->isInformative();
173  if (informative_external_apriori)
174  this->_apriori->addAllApriori(idset, N_xyz);
175  const std::size_t all_size = (N_xyz.size());
176 
177  // compute the domain sizes of X and Y
178  const auto& nodeId2cols = this->_counter.nodeId2Columns();
179  const auto& database = this->_counter.database();
180  Idx var_x, var_y;
181  if (nodeId2cols.empty()) {
182  var_x = idset[0];
183  var_y = idset[1];
184  } else {
185  var_x = nodeId2cols.second(idset[0]);
186  var_y = nodeId2cols.second(idset[1]);
187  }
188 
189  const std::size_t X_size = database.domainSize(var_x);
190  const std::size_t Y_size = database.domainSize(var_y);
191 
192  double cumulStat = 0;
193 
194  // here, we distinguish idsets with conditioning nodes from those
195  // without conditioning nodes
196  if (idset.hasConditioningSet()) {
197  const std::size_t Z_size = all_size / (X_size * Y_size);
198 
199  // get the counts for the conditioning nodes
200  std::vector< double, ALLOC< double > > N_xz =
201  this->_marginalize(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
202  std::vector< double, ALLOC< double > > N_yz =
203  this->_marginalize(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
204  std::vector< double, ALLOC< double > > N_z =
205  this->_marginalize(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
206 
207  // indicate to the chi2 distribution the set of conditioning nodes
208  std::vector< Idx > cond_nodes;
209  cond_nodes.reserve(idset.nbRHSIds());
210  {
211  const auto cond_idset = idset.conditionalIdSet().ids();
212  if (nodeId2cols.empty()) {
213  for (const auto node : cond_idset)
214  cond_nodes.push_back(node);
215  } else {
216  for (const auto node : cond_idset)
217  cond_nodes.push_back(nodeId2cols.second(node));
218  }
219  }
220  __chi2.setConditioningNodes(cond_nodes);
221 
222 
223  // now, perform sum_X sum_Y sum_Z ( #ZYX - #ZX * #ZY / #Z )^2 /
224  // (#ZX * #ZY / #Z )
225  for (std::size_t z = std::size_t(0),
226  beg_xz = std::size_t(0),
227  beg_yz = std::size_t(0),
228  xyz = std::size_t(0);
229  z < Z_size;
230  ++z, beg_xz += X_size, beg_yz += Y_size) {
231  if (N_z[z] > 0) {
232  for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size;
233  ++yz, ++y) {
234  for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size;
235  ++xz, ++x, ++xyz) {
236  const double tmp1 = (N_yz[yz] * N_xz[xz]) / N_z[z];
237  if (tmp1 != 0.0) {
238  const double tmp2 = N_xyz[xyz] - tmp1;
239  cumulStat += (tmp2 * tmp2) / tmp1;
240  }
241  }
242  }
243  } else { // moving xyz out of the loops x,y when if N_z[z]==0
244  xyz += X_size * Y_size;
245  }
246  }
247  } else {
248  // here, there is no conditioning set
249 
250  // indicate to the chi2 distribution the set of conditioning nodes
251  __chi2.setConditioningNodes(__empty_set);
252 
253  // now, perform sum_X sum_Y ( #XY - (#X * #Y) / N )^2 / (#X * #Y )/N
254 
255  // get the counts for all the targets and for the conditioning nodes
256  std::vector< double, ALLOC< double > > N_x = this->_marginalize(
257  std::size_t(1), X_size, Y_size, std::size_t(1), N_xyz);
258  std::vector< double, ALLOC< double > > N_y = this->_marginalize(
259  std::size_t(0), X_size, Y_size, std::size_t(1), N_xyz);
260 
261  // count N
262  double N = 0.0;
263  for (const auto n_x : N_x)
264  N += n_x;
265 
266  for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
267  const double tmp_Ny = N_y[y];
268  for (std::size_t x = 0; x < X_size; ++x, ++xy) {
269  const double tmp1 = (tmp_Ny * N_x[x]) / N;
270  if (tmp1 != 0.0) {
271  const double tmp2 = N_xyz[xy] - tmp1;
272  cumulStat += (tmp2 * tmp2) / tmp1;
273  }
274  }
275  }
276  }
277 
278  Size df = __chi2.degreesOfFreedom(var_x, var_y);
279  double pValue = __chi2.probaChi2(cumulStat, df);
280  return std::pair< double, double >(cumulStat, pValue);
281  }
282 
283 
285  template < template < typename > class ALLOC >
286  inline double IndepTestChi2< ALLOC >::_score(const IdSet< ALLOC >& idset) {
287  const auto& nodeId2cols = this->_counter.nodeId2Columns();
288  Idx var_x, var_y;
289  if (nodeId2cols.empty()) {
290  var_x = idset[0];
291  var_y = idset[1];
292  } else {
293  var_x = nodeId2cols.second(idset[0]);
294  var_y = nodeId2cols.second(idset[1]);
295  }
296 
297  auto stat = _statistics(idset); // stat contains pair(Chi2stat,pValue)
298  double score = stat.first;
299 
300  // ok, here, score contains the value of the chi2 formula.
301  // To get a meaningful score, we shall compute the critical values
302  // for the Chi2 distribution and assign as the score of
303  // (score - alpha ) / alpha, where alpha is the critical value
304  const double alpha = __chi2.criticalValue(var_x, var_y);
305  score = (score - alpha) / alpha;
306 
307  return score;
308  }
309 
310  } /* namespace learning */
311 
312 } /* namespace gum */
313 
314 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
std::pair< double, double > _statistics(const IdSet< ALLOC > &idset)
compute the pair <chi2 statistic,pvalue>
virtual IndepTestChi2< ALLOC > * clone() const
virtual copy constructor
ALLOC< NodeId > allocator_type
type for the allocators passed in arguments of methods
Definition: indepTestChi2.h:51
IndepTestChi2(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
std::vector< double, ALLOC< double > > _marginalize(const std::size_t node_2_marginalize, const std::size_t X_size, const std::size_t Y_size, const std::size_t Z_size, const std::vector< double, ALLOC< double > > &N_xyz) const
returns a counting vector where variables are marginalized from N_xyz
IndepTestChi2< ALLOC > & operator=(const IndepTestChi2< ALLOC > &from)
copy operator
RecordCounter< ALLOC > _counter
the record counter used for the countings over discrete variables
STL namespace.
const DatabaseTable< ALLOC > & database() const
return the database used by the score
Copyright 2005-2019 Pierre-Henri WUILLEMIN et Christophe GONZALES (LIP6) {prenom.nom}_at_lip6.fr.
Copyright 2005-2019 Pierre-Henri WUILLEMIN et Christophe GONZALES (LIP6) {prenom.nom}_at_lip6.fr.
Definition: agrum.h:25
std::pair< double, double > statistics(NodeId var1, NodeId var2, const std::vector< NodeId, ALLOC< NodeId > > &rhs_ids={})
get the pair <chi2 statistic,pvalue> for a test var1 indep var2 given rhs_ids
IndependenceTest(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
allocator_type getAllocator() const
returns the allocator used by the score
IndependenceTest< ALLOC > & operator=(const IndependenceTest< ALLOC > &from)
copy operator
double score(const NodeId var1, const NodeId var2)
returns the score of a pair of nodes
virtual double _score(const IdSet< ALLOC > &idset) final
returns the score for a given IdSet
virtual ~IndepTestChi2()
destructor
std::size_t Size
In aGrUM, hashed values are unsigned long int.
Definition: types.h:48
Size NodeId
Type for node ids.
Definition: graphElements.h:98
Apriori< ALLOC > * _apriori
the expert knowledge a priori we add to the contongency tables