aGrUM
0.20.3
a C++ library for (probabilistic) graphical models
indepTestChi2_tpl.h
Go to the documentation of this file.
1
/**
2
*
3
* Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4
* info_at_agrum_dot_org
5
*
6
* This library is free software: you can redistribute it and/or modify
7
* it under the terms of the GNU Lesser General Public License as published by
8
* the Free Software Foundation, either version 3 of the License, or
9
* (at your option) any later version.
10
*
11
* This library is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU Lesser General Public License for more details.
15
*
16
* You should have received a copy of the GNU Lesser General Public License
17
* along with this library. If not, see <http://www.gnu.org/licenses/>.
18
*
19
*/
20
21
22
/** @file
23
* @brief the class for computing Chi2 scores
24
*
25
* @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26
*/
27
28
#
ifndef
DOXYGEN_SHOULD_SKIP_THIS
29
30
#
include
<
agrum
/
tools
/
stattests
/
idCondSet
.
h
>
31
32
namespace
gum
{
33
34
namespace
learning
{
35
36
/// default constructor
37
template
<
template
<
typename
>
class
ALLOC >
38
INLINE IndepTestChi2<
ALLOC
>::
IndepTestChi2
(
39
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
40
const
Apriori
<
ALLOC
>&
apriori
,
41
const
std
::
vector
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
>,
42
ALLOC
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
> > >&
ranges
,
43
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
nodeId2columns
,
44
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
45
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
ranges
,
nodeId2columns
,
alloc
),
46
_domain_sizes_
(
parser
.
database
().
domainSizes
()),
_chi2_
(
_domain_sizes_
) {
47
GUM_CONSTRUCTOR
(
IndepTestChi2
);
48
}
49
50
51
/// default constructor
52
template
<
template
<
typename
>
class
ALLOC
>
53
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
54
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
55
const
Apriori
<
ALLOC
>&
apriori
,
56
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
nodeId2columns
,
57
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
58
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
nodeId2columns
,
alloc
),
59
_domain_sizes_
(
parser
.
database
().
domainSizes
()),
_chi2_
(
_domain_sizes_
) {
60
GUM_CONSTRUCTOR
(
IndepTestChi2
);
61
}
62
63
64
/// copy constructor with a given allocator
65
template
<
template
<
typename
>
class
ALLOC
>
66
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
67
const
IndepTestChi2
<
ALLOC
>&
from
,
68
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
69
IndependenceTest
<
ALLOC
>(
from
,
alloc
),
70
_domain_sizes_
(
from
.
_domain_sizes_
),
_chi2_
(
_domain_sizes_
) {
71
GUM_CONS_CPY
(
IndepTestChi2
);
72
}
73
74
75
/// copy constructor
76
template
<
template
<
typename
>
class
ALLOC
>
77
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
const
IndepTestChi2
<
ALLOC
>&
from
) :
78
IndepTestChi2
<
ALLOC
>(
from
,
from
.
getAllocator
()) {}
79
80
81
/// move constructor with a given allocator
82
template
<
template
<
typename
>
class
ALLOC
>
83
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
84
IndepTestChi2
<
ALLOC
>&&
from
,
85
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
86
IndependenceTest
<
ALLOC
>(
std
::
move
(
from
),
alloc
),
87
_domain_sizes_
(
from
.
_domain_sizes_
),
_chi2_
(
_domain_sizes_
) {
88
GUM_CONS_MOV
(
IndepTestChi2
);
89
}
90
91
92
/// move constructor
93
template
<
template
<
typename
>
class
ALLOC
>
94
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
IndepTestChi2
<
ALLOC
>&&
from
) :
95
IndepTestChi2
<
ALLOC
>(
std
::
move
(
from
),
from
.
getAllocator
()) {}
96
97
98
/// virtual copy constructor with a given allocator
99
template
<
template
<
typename
>
class
ALLOC
>
100
IndepTestChi2
<
ALLOC
>*
IndepTestChi2
<
ALLOC
>::
clone
(
101
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
)
const
{
102
ALLOC
<
IndepTestChi2
<
ALLOC
> >
allocator
(
alloc
);
103
IndepTestChi2
<
ALLOC
>*
new_score
=
allocator
.
allocate
(1);
104
try
{
105
allocator
.
construct
(
new_score
, *
this
,
alloc
);
106
}
catch
(...) {
107
allocator
.
deallocate
(
new_score
, 1);
108
throw
;
109
}
110
111
return
new_score
;
112
}
113
114
115
/// virtual copy constructor
116
template
<
template
<
typename
>
class
ALLOC
>
117
IndepTestChi2
<
ALLOC
>*
IndepTestChi2
<
ALLOC
>::
clone
()
const
{
118
return
clone
(
this
->
getAllocator
());
119
}
120
121
122
/// destructor
123
template
<
template
<
typename
>
class
ALLOC
>
124
IndepTestChi2
<
ALLOC
>::~
IndepTestChi2
() {
125
GUM_DESTRUCTOR
(
IndepTestChi2
);
126
}
127
128
129
/// copy operator
130
template
<
template
<
typename
>
class
ALLOC
>
131
IndepTestChi2
<
ALLOC
>&
IndepTestChi2
<
ALLOC
>::
operator
=(
const
IndepTestChi2
<
ALLOC
>&
from
) {
132
if
(
this
!= &
from
) {
133
IndependenceTest
<
ALLOC
>::
operator
=(
from
);
134
// __chi2 = from. _chi2_;
135
}
136
return
*
this
;
137
}
138
139
140
/// move operator
141
template
<
template
<
typename
>
class
ALLOC
>
142
IndepTestChi2
<
ALLOC
>&
IndepTestChi2
<
ALLOC
>::
operator
=(
IndepTestChi2
<
ALLOC
>&&
from
) {
143
if
(
this
!= &
from
) {
144
IndependenceTest
<
ALLOC
>::
operator
=(
std
::
move
(
from
));
145
// __chi2 = std::move(from. _chi2_);
146
}
147
return
*
this
;
148
}
149
150
/// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
151
template
<
template
<
typename
>
class
ALLOC
>
152
std
::
pair
<
double
,
double
>
153
IndepTestChi2
<
ALLOC
>::
statistics
(
NodeId
var1
,
154
NodeId
var2
,
155
const
std
::
vector
<
NodeId
,
ALLOC
<
NodeId
> >&
rhs_ids
) {
156
return
statistics_
(
IdCondSet
<
ALLOC
>(
var1
,
var2
,
rhs_ids
,
false
));
157
}
158
159
/// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
160
template
<
template
<
typename
>
class
ALLOC
>
161
std
::
pair
<
double
,
double
>
162
IndepTestChi2
<
ALLOC
>::
statistics_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
163
// get the countings
164
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xyz
(
this
->
counter_
.
counts
(
idset
,
true
));
165
const
bool
informative_external_apriori
=
this
->
apriori_
->
isInformative
();
166
if
(
informative_external_apriori
)
this
->
apriori_
->
addAllApriori
(
idset
,
N_xyz
);
167
const
std
::
size_t
all_size
= (
N_xyz
.
size
());
168
169
// compute the domain sizes of X and Y
170
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
171
const
auto
&
database
=
this
->
counter_
.
database
();
172
Idx
var_x
,
var_y
;
173
if
(
nodeId2cols
.
empty
()) {
174
var_x
=
idset
[0];
175
var_y
=
idset
[1];
176
}
else
{
177
var_x
=
nodeId2cols
.
second
(
idset
[0]);
178
var_y
=
nodeId2cols
.
second
(
idset
[1]);
179
}
180
181
const
std
::
size_t
X_size
=
database
.
domainSize
(
var_x
);
182
const
std
::
size_t
Y_size
=
database
.
domainSize
(
var_y
);
183
184
double
cumulStat
= 0;
185
186
// here, we distinguish idsets with conditioning nodes from those
187
// without conditioning nodes
188
if
(
idset
.
hasConditioningSet
()) {
189
const
std
::
size_t
Z_size
=
all_size
/ (
X_size
*
Y_size
);
190
191
// get the counts for the conditioning nodes
192
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xz
193
=
this
->
marginalize_
(
std
::
size_t
(1),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
194
std
::
vector
<
double
,
ALLOC
<
double
> >
N_yz
195
=
this
->
marginalize_
(
std
::
size_t
(0),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
196
std
::
vector
<
double
,
ALLOC
<
double
> >
N_z
197
=
this
->
marginalize_
(
std
::
size_t
(2),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
198
199
// indicate to the chi2 distribution the set of conditioning nodes
200
std
::
vector
<
Idx
>
cond_nodes
;
201
cond_nodes
.
reserve
(
idset
.
nbRHSIds
());
202
{
203
const
auto
cond_idset
=
idset
.
conditionalIdCondSet
().
ids
();
204
if
(
nodeId2cols
.
empty
()) {
205
for
(
const
auto
node
:
cond_idset
)
206
cond_nodes
.
push_back
(
node
);
207
}
else
{
208
for
(
const
auto
node
:
cond_idset
)
209
cond_nodes
.
push_back
(
nodeId2cols
.
second
(
node
));
210
}
211
}
212
_chi2_
.
setConditioningNodes
(
cond_nodes
);
213
214
215
// now, perform sum_X sum_Y sum_Z ( #ZYX - #ZX * #ZY / #Z )^2 /
216
// (#ZX * #ZY / #Z )
217
for
(
std
::
size_t
z
=
std
::
size_t
(0),
218
beg_xz
=
std
::
size_t
(0),
219
beg_yz
=
std
::
size_t
(0),
220
xyz
=
std
::
size_t
(0);
221
z
<
Z_size
;
222
++
z
,
beg_xz
+=
X_size
,
beg_yz
+=
Y_size
) {
223
if
(
N_z
[
z
] > 0) {
224
for
(
std
::
size_t
y
=
std
::
size_t
(0),
yz
=
beg_yz
;
y
<
Y_size
; ++
yz
, ++
y
) {
225
for
(
std
::
size_t
x
=
std
::
size_t
(0),
xz
=
beg_xz
;
x
<
X_size
; ++
xz
, ++
x
, ++
xyz
) {
226
const
double
tmp1
= (
N_yz
[
yz
] *
N_xz
[
xz
]) /
N_z
[
z
];
227
if
(
tmp1
!= 0.0) {
228
const
double
tmp2
=
N_xyz
[
xyz
] -
tmp1
;
229
cumulStat
+= (
tmp2
*
tmp2
) /
tmp1
;
230
}
231
}
232
}
233
}
else
{
// moving xyz out of the loops x,y when if N_z[z]==0
234
xyz
+=
X_size
*
Y_size
;
235
}
236
}
237
}
else
{
238
// here, there is no conditioning set
239
240
// indicate to the chi2 distribution the set of conditioning nodes
241
_chi2_
.
setConditioningNodes
(
_empty_set_
);
242
243
// now, perform sum_X sum_Y ( #XY - (#X * #Y) / N )^2 / (#X * #Y )/N
244
245
// get the counts for all the targets and for the conditioning nodes
246
std
::
vector
<
double
,
ALLOC
<
double
> >
N_x
247
=
this
->
marginalize_
(
std
::
size_t
(1),
X_size
,
Y_size
,
std
::
size_t
(1),
N_xyz
);
248
std
::
vector
<
double
,
ALLOC
<
double
> >
N_y
249
=
this
->
marginalize_
(
std
::
size_t
(0),
X_size
,
Y_size
,
std
::
size_t
(1),
N_xyz
);
250
251
// count N
252
double
N
= 0.0;
253
for
(
const
auto
n_x
:
N_x
)
254
N
+=
n_x
;
255
256
for
(
std
::
size_t
y
=
std
::
size_t
(0),
xy
= 0;
y
<
Y_size
; ++
y
) {
257
const
double
tmp_Ny
=
N_y
[
y
];
258
for
(
std
::
size_t
x
= 0;
x
<
X_size
; ++
x
, ++
xy
) {
259
const
double
tmp1
= (
tmp_Ny
*
N_x
[
x
]) /
N
;
260
if
(
tmp1
!= 0.0) {
261
const
double
tmp2
=
N_xyz
[
xy
] -
tmp1
;
262
cumulStat
+= (
tmp2
*
tmp2
) /
tmp1
;
263
}
264
}
265
}
266
}
267
268
Size
df
=
_chi2_
.
degreesOfFreedom
(
var_x
,
var_y
);
269
double
pValue
=
_chi2_
.
probaChi2
(
cumulStat
,
df
);
270
return
std
::
pair
<
double
,
double
>(
cumulStat
,
pValue
);
271
}
272
273
274
/// returns the score corresponding to a given nodeset
275
template
<
template
<
typename
>
class
ALLOC
>
276
inline
double
IndepTestChi2
<
ALLOC
>::
score_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
277
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
278
Idx
var_x
,
var_y
;
279
if
(
nodeId2cols
.
empty
()) {
280
var_x
=
idset
[0];
281
var_y
=
idset
[1];
282
}
else
{
283
var_x
=
nodeId2cols
.
second
(
idset
[0]);
284
var_y
=
nodeId2cols
.
second
(
idset
[1]);
285
}
286
287
auto
stat
=
statistics_
(
idset
);
// stat contains pair(Chi2stat,pValue)
288
double
score
=
stat
.
first
;
289
290
// ok, here, score contains the value of the chi2 formula.
291
// To get a meaningful score, we shall compute the critical values
292
// for the Chi2 distribution and assign as the score of
293
// (score - alpha ) / alpha, where alpha is the critical value
294
const
double
alpha
=
_chi2_
.
criticalValue
(
var_x
,
var_y
);
295
score
= (
score
-
alpha
) /
alpha
;
296
297
return
score
;
298
}
299
300
}
/* namespace learning */
301
302
}
/* namespace gum */
303
304
#
endif
/* DOXYGEN_SHOULD_SKIP_THIS */
gum::Set::emplace
INLINE void emplace(Args &&... args)
Definition:
set_tpl.h:643
gum::learning::genericBNLearner::Database::Database
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
Definition:
genericBNLearner_tpl.h:31