aGrUM  0.14.2
indepTestChi2_tpl.h
Go to the documentation of this file.
1 /***************************************************************************
2  * Copyright (C) 2005 by Christophe GONZALES and Pierre-Henri WUILLEMIN *
3  * {prenom.nom}_at_lip6.fr *
4  * *
5  * This program is free software; you can redistribute it and/or modify *
6  * it under the terms of the GNU General Public License as published by *
7  * the Free Software Foundation; either version 2 of the License, or *
8  * (at your option) any later version. *
9  * *
10  * This program is distributed in the hope that it will be useful, *
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13  * GNU General Public License for more details. *
14  * *
15  * You should have received a copy of the GNU General Public License *
16  * along with this program; if not, write to the *
17  * Free Software Foundation, Inc., *
18  * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19  ***************************************************************************/
26 #ifndef DOXYGEN_SHOULD_SKIP_THIS
28 
29 namespace gum {
30 
31  namespace learning {
32 
34  template < template < typename > class ALLOC >
36  const DBRowGeneratorParser< ALLOC >& parser,
37  const Apriori< ALLOC >& apriori,
38  const std::vector< std::pair< std::size_t, std::size_t >,
39  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
40  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
41  nodeId2columns,
42  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
43  IndependenceTest< ALLOC >(parser, apriori, ranges, nodeId2columns, alloc),
44  __domain_sizes(parser.database().domainSizes()), __chi2(__domain_sizes) {
45  GUM_CONSTRUCTOR(IndepTestChi2);
46  }
47 
48 
50  template < template < typename > class ALLOC >
52  const DBRowGeneratorParser< ALLOC >& parser,
53  const Apriori< ALLOC >& apriori,
54  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
55  nodeId2columns,
56  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
57  IndependenceTest< ALLOC >(parser, apriori, nodeId2columns, alloc),
58  __domain_sizes(parser.database().domainSizes()), __chi2(__domain_sizes) {
59  GUM_CONSTRUCTOR(IndepTestChi2);
60  }
61 
62 
64  template < template < typename > class ALLOC >
66  const IndepTestChi2< ALLOC >& from,
67  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
68  IndependenceTest< ALLOC >(from, alloc),
69  __domain_sizes(from.__domain_sizes), __chi2(__domain_sizes) {
70  GUM_CONS_CPY(IndepTestChi2);
71  }
72 
73 
75  template < template < typename > class ALLOC >
76  INLINE
77  IndepTestChi2< ALLOC >::IndepTestChi2(const IndepTestChi2< ALLOC >& from) :
78  IndepTestChi2< ALLOC >(from, from.getAllocator()) {}
79 
80 
82  template < template < typename > class ALLOC >
84  IndepTestChi2< ALLOC >&& from,
85  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) :
86  IndependenceTest< ALLOC >(std::move(from), alloc),
87  __domain_sizes(from.__domain_sizes), __chi2(__domain_sizes) {
88  GUM_CONS_MOV(IndepTestChi2);
89  }
90 
91 
93  template < template < typename > class ALLOC >
94  INLINE IndepTestChi2< ALLOC >::IndepTestChi2(IndepTestChi2< ALLOC >&& from) :
95  IndepTestChi2< ALLOC >(std::move(from), from.getAllocator()) {}
96 
97 
99  template < template < typename > class ALLOC >
100  IndepTestChi2< ALLOC >* IndepTestChi2< ALLOC >::clone(
101  const typename IndepTestChi2< ALLOC >::allocator_type& alloc) const {
102  ALLOC< IndepTestChi2< ALLOC > > allocator(alloc);
103  IndepTestChi2< ALLOC >* new_score = allocator.allocate(1);
104  try {
105  allocator.construct(new_score, *this, alloc);
106  } catch (...) {
107  allocator.deallocate(new_score, 1);
108  throw;
109  }
110 
111  return new_score;
112  }
113 
114 
116  template < template < typename > class ALLOC >
117  IndepTestChi2< ALLOC >* IndepTestChi2< ALLOC >::clone() const {
118  return clone(this->getAllocator());
119  }
120 
121 
123  template < template < typename > class ALLOC >
125  GUM_DESTRUCTOR(IndepTestChi2);
126  }
127 
128 
130  template < template < typename > class ALLOC >
131  IndepTestChi2< ALLOC >& IndepTestChi2< ALLOC >::
132  operator=(const IndepTestChi2< ALLOC >& from) {
133  if (this != &from) {
135  //__chi2 = from.__chi2;
136  }
137  return *this;
138  }
139 
140 
142  template < template < typename > class ALLOC >
143  IndepTestChi2< ALLOC >& IndepTestChi2< ALLOC >::
144  operator=(IndepTestChi2< ALLOC >&& from) {
145  if (this != &from) {
146  IndependenceTest< ALLOC >::operator=(std::move(from));
147  //__chi2 = std::move(from.__chi2);
148  }
149  return *this;
150  }
151 
153  template < template < typename > class ALLOC >
154  std::pair< double, double > IndepTestChi2< ALLOC >::statistics(
155  NodeId var1,
156  NodeId var2,
157  const std::vector< NodeId, ALLOC< NodeId > >& rhs_ids) {
158  return _statistics(IdSet< ALLOC >(var1, var2, rhs_ids, false));
159  }
160 
162  template < template < typename > class ALLOC >
163  std::pair< double, double >
164  IndepTestChi2< ALLOC >::_statistics(const IdSet< ALLOC >& idset) {
165  // get the countings
166  std::vector< double, ALLOC< double > > N_xyz(
167  this->_counter.counts(idset, true));
168  const bool informative_external_apriori = this->_apriori->isInformative();
169  if (informative_external_apriori)
170  this->_apriori->addAllApriori(idset, N_xyz);
171  const std::size_t all_size = (N_xyz.size());
172 
173  // compute the domain sizes of X and Y
174  const auto& nodeId2cols = this->_counter.nodeId2Columns();
175  const auto& database = this->_counter.database();
176  Idx var_x, var_y;
177  if (nodeId2cols.empty()) {
178  var_x = idset[0];
179  var_y = idset[1];
180  } else {
181  var_x = nodeId2cols.second(idset[0]);
182  var_y = nodeId2cols.second(idset[1]);
183  }
184 
185  const std::size_t X_size = database.domainSize(var_x);
186  const std::size_t Y_size = database.domainSize(var_y);
187 
188  double cumulStat = 0;
189 
190  // here, we distinguish idsets with conditioning nodes from those
191  // without conditioning nodes
192  if (idset.hasConditioningSet()) {
193  const std::size_t Z_size = all_size / (X_size * Y_size);
194 
195  // get the counts for the conditioning nodes
196  std::vector< double, ALLOC< double > > N_xz =
197  this->_marginalize(std::size_t(1), X_size, Y_size, Z_size, N_xyz);
198  std::vector< double, ALLOC< double > > N_yz =
199  this->_marginalize(std::size_t(0), X_size, Y_size, Z_size, N_xyz);
200  std::vector< double, ALLOC< double > > N_z =
201  this->_marginalize(std::size_t(2), X_size, Y_size, Z_size, N_xyz);
202 
203  // indicate to the chi2 distribution the set of conditioning nodes
204  std::vector< Idx > cond_nodes;
205  cond_nodes.reserve(idset.nbRHSIds());
206  {
207  const auto cond_idset = idset.conditionalIdSet().ids();
208  if (nodeId2cols.empty()) {
209  for (const auto node : cond_idset)
210  cond_nodes.push_back(node);
211  } else {
212  for (const auto node : cond_idset)
213  cond_nodes.push_back(nodeId2cols.second(node));
214  }
215  }
216  __chi2.setConditioningNodes(cond_nodes);
217 
218 
219  // now, perform sum_X sum_Y sum_Z ( #ZYX - #ZX * #ZY / #Z )^2 /
220  // (#ZX * #ZY / #Z )
221 
222  for (std::size_t z = std::size_t(0),
223  beg_xz = std::size_t(0),
224  beg_yz = std::size_t(0),
225  xyz = std::size_t(0);
226  z < Z_size;
227  ++z, beg_xz += X_size, beg_yz += Y_size) {
228  if (N_z[z]) {
229  for (std::size_t y = std::size_t(0), yz = beg_yz; y < Y_size;
230  ++yz, ++y) {
231  for (std::size_t x = std::size_t(0), xz = beg_xz; x < X_size;
232  ++xz, ++x, ++xyz) {
233  const double tmp1 = (N_yz[yz] * N_xz[xz]) / N_z[z];
234  if (tmp1) {
235  const double tmp2 = N_xyz[xyz] - tmp1;
236  cumulStat += (tmp2 * tmp2) / tmp1;
237  }
238  }
239  }
240  }
241  }
242  } else {
243  // here, there is no conditioning set
244 
245  // indicate to the chi2 distribution the set of conditioning nodes
246  __chi2.setConditioningNodes(__empty_set);
247 
248  // now, perform sum_X sum_Y ( #XY - (#X * #Y) / N )^2 / (#X * #Y )/N
249 
250  // get the counts for all the targets and for the conditioning nodes
251  std::vector< double, ALLOC< double > > N_x = this->_marginalize(
252  std::size_t(1), X_size, Y_size, std::size_t(1), N_xyz);
253  std::vector< double, ALLOC< double > > N_y = this->_marginalize(
254  std::size_t(0), X_size, Y_size, std::size_t(1), N_xyz);
255 
256  // count N
257  double N = 0.0;
258  for (const auto n_x : N_x)
259  N += n_x;
260 
261  for (std::size_t y = std::size_t(0), xy = 0; y < Y_size; ++y) {
262  const double tmp_Ny = N_y[y];
263  for (std::size_t x = 0; x < X_size; ++x, ++xy) {
264  const double tmp1 = (tmp_Ny * N_x[x]) / N;
265  if (tmp1) {
266  const double tmp2 = N_xyz[xy] - tmp1;
267  cumulStat += (tmp2 * tmp2) / tmp1;
268  }
269  }
270  }
271  }
272 
273  Size df = __chi2.degreesOfFreedom(var_x, var_y);
274  double pValue = __chi2.probaChi2(cumulStat, df);
275  return std::pair< double, double >(cumulStat, pValue);
276  }
277 
278 
280  template < template < typename > class ALLOC >
281  inline double IndepTestChi2< ALLOC >::_score(const IdSet< ALLOC >& idset) {
282  const auto& nodeId2cols = this->_counter.nodeId2Columns();
283  Idx var_x, var_y;
284  if (nodeId2cols.empty()) {
285  var_x = idset[0];
286  var_y = idset[1];
287  } else {
288  var_x = nodeId2cols.second(idset[0]);
289  var_y = nodeId2cols.second(idset[1]);
290  }
291 
292  auto stat = _statistics(idset); // stat contains pair(Chi2stat,pValue)
293  double score = stat.first;
294 
295  // ok, here, score contains the value of the chi2 formula.
296  // To get a meaningful score, we shall compute the critical values
297  // for the Chi2 distribution and assign as the score of
298  // (score - alpha ) / alpha, where alpha is the critical value
299  const double alpha = __chi2.criticalValue(var_x, var_y);
300  score = (score - alpha) / alpha;
301 
302  return score;
303  }
304 
305  } /* namespace learning */
306 
307 } /* namespace gum */
308 
309 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
std::pair< double, double > _statistics(const IdSet< ALLOC > &idset)
compute the pair <chi2 statistic,pvalue>
virtual IndepTestChi2< ALLOC > * clone() const
virtual copy constructor
ALLOC< NodeId > allocator_type
type for the allocators passed in arguments of methods
Definition: indepTestChi2.h:48
IndepTestChi2(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
std::vector< double, ALLOC< double > > _marginalize(const std::size_t node_2_marginalize, const std::size_t X_size, const std::size_t Y_size, const std::size_t Z_size, const std::vector< double, ALLOC< double > > &N_xyz) const
returns a counting vector where variables are marginalized from N_xyz
IndepTestChi2< ALLOC > & operator=(const IndepTestChi2< ALLOC > &from)
copy operator
RecordCounter< ALLOC > _counter
the record counter used for the countings over discrete variables
STL namespace.
const DatabaseTable< ALLOC > & database() const
return the database used by the score
A class used by learning caches to represent uniquely sets of variables.
gum is the global namespace for all aGrUM entities
Definition: agrum.h:25
std::pair< double, double > statistics(NodeId var1, NodeId var2, const std::vector< NodeId, ALLOC< NodeId > > &rhs_ids={})
get the pair <chi2 statistic,pvalue> for a test var1 indep var2 given rhs_ids
IndependenceTest(const DBRowGeneratorParser< ALLOC > &parser, const Apriori< ALLOC > &external_apriori, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
allocator_type getAllocator() const
returns the allocator used by the score
IndependenceTest< ALLOC > & operator=(const IndependenceTest< ALLOC > &from)
copy operator
double score(const NodeId var1, const NodeId var2)
returns the score of a pair of nodes
virtual double _score(const IdSet< ALLOC > &idset) final
returns the score for a given IdSet
virtual ~IndepTestChi2()
destructor
std::size_t Size
In aGrUM, hashed values are unsigned long int.
Definition: types.h:45
Size NodeId
Type for node ids.
Definition: graphElements.h:97
Apriori< ALLOC > * _apriori
the expert knowledge a priori we add to the contongency tables