aGrUM
0.20.2
a C++ library for (probabilistic) graphical models
indepTestChi2_tpl.h
Go to the documentation of this file.
1
/**
2
*
3
* Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4
* info_at_agrum_dot_org
5
*
6
* This library is free software: you can redistribute it and/or modify
7
* it under the terms of the GNU Lesser General Public License as published by
8
* the Free Software Foundation, either version 3 of the License, or
9
* (at your option) any later version.
10
*
11
* This library is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU Lesser General Public License for more details.
15
*
16
* You should have received a copy of the GNU Lesser General Public License
17
* along with this library. If not, see <http://www.gnu.org/licenses/>.
18
*
19
*/
20
21
22
/** @file
23
* @brief the class for computing Chi2 scores
24
*
25
* @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26
*/
27
28
#
ifndef
DOXYGEN_SHOULD_SKIP_THIS
29
30
#
include
<
agrum
/
tools
/
stattests
/
idCondSet
.
h
>
31
32
namespace
gum
{
33
34
namespace
learning
{
35
36
/// default constructor
37
template
<
template
<
typename
>
class
ALLOC >
38
INLINE IndepTestChi2<
ALLOC
>::
IndepTestChi2
(
39
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
40
const
Apriori
<
ALLOC
>&
apriori
,
41
const
std
::
vector
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
>,
42
ALLOC
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
> > >&
ranges
,
43
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
44
nodeId2columns
,
45
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
46
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
ranges
,
nodeId2columns
,
alloc
),
47
domain_sizes__
(
parser
.
database
().
domainSizes
()),
chi2__
(
domain_sizes__
) {
48
GUM_CONSTRUCTOR
(
IndepTestChi2
);
49
}
50
51
52
/// default constructor
53
template
<
template
<
typename
>
class
ALLOC
>
54
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
55
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
56
const
Apriori
<
ALLOC
>&
apriori
,
57
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
58
nodeId2columns
,
59
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
60
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
nodeId2columns
,
alloc
),
61
domain_sizes__
(
parser
.
database
().
domainSizes
()),
chi2__
(
domain_sizes__
) {
62
GUM_CONSTRUCTOR
(
IndepTestChi2
);
63
}
64
65
66
/// copy constructor with a given allocator
67
template
<
template
<
typename
>
class
ALLOC
>
68
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
69
const
IndepTestChi2
<
ALLOC
>&
from
,
70
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
71
IndependenceTest
<
ALLOC
>(
from
,
alloc
),
72
domain_sizes__
(
from
.
domain_sizes__
),
chi2__
(
domain_sizes__
) {
73
GUM_CONS_CPY
(
IndepTestChi2
);
74
}
75
76
77
/// copy constructor
78
template
<
template
<
typename
>
class
ALLOC
>
79
INLINE
80
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
const
IndepTestChi2
<
ALLOC
>&
from
) :
81
IndepTestChi2
<
ALLOC
>(
from
,
from
.
getAllocator
()) {}
82
83
84
/// move constructor with a given allocator
85
template
<
template
<
typename
>
class
ALLOC
>
86
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
87
IndepTestChi2
<
ALLOC
>&&
from
,
88
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
) :
89
IndependenceTest
<
ALLOC
>(
std
::
move
(
from
),
alloc
),
90
domain_sizes__
(
from
.
domain_sizes__
),
chi2__
(
domain_sizes__
) {
91
GUM_CONS_MOV
(
IndepTestChi2
);
92
}
93
94
95
/// move constructor
96
template
<
template
<
typename
>
class
ALLOC
>
97
INLINE
IndepTestChi2
<
ALLOC
>::
IndepTestChi2
(
IndepTestChi2
<
ALLOC
>&&
from
) :
98
IndepTestChi2
<
ALLOC
>(
std
::
move
(
from
),
from
.
getAllocator
()) {}
99
100
101
/// virtual copy constructor with a given allocator
102
template
<
template
<
typename
>
class
ALLOC
>
103
IndepTestChi2
<
ALLOC
>*
IndepTestChi2
<
ALLOC
>::
clone
(
104
const
typename
IndepTestChi2
<
ALLOC
>::
allocator_type
&
alloc
)
const
{
105
ALLOC
<
IndepTestChi2
<
ALLOC
> >
allocator
(
alloc
);
106
IndepTestChi2
<
ALLOC
>*
new_score
=
allocator
.
allocate
(1);
107
try
{
108
allocator
.
construct
(
new_score
, *
this
,
alloc
);
109
}
catch
(...) {
110
allocator
.
deallocate
(
new_score
, 1);
111
throw
;
112
}
113
114
return
new_score
;
115
}
116
117
118
/// virtual copy constructor
119
template
<
template
<
typename
>
class
ALLOC
>
120
IndepTestChi2
<
ALLOC
>*
IndepTestChi2
<
ALLOC
>::
clone
()
const
{
121
return
clone
(
this
->
getAllocator
());
122
}
123
124
125
/// destructor
126
template
<
template
<
typename
>
class
ALLOC
>
127
IndepTestChi2
<
ALLOC
>::~
IndepTestChi2
() {
128
GUM_DESTRUCTOR
(
IndepTestChi2
);
129
}
130
131
132
/// copy operator
133
template
<
template
<
typename
>
class
ALLOC
>
134
IndepTestChi2
<
ALLOC
>&
135
IndepTestChi2
<
ALLOC
>::
operator
=(
const
IndepTestChi2
<
ALLOC
>&
from
) {
136
if
(
this
!= &
from
) {
137
IndependenceTest
<
ALLOC
>::
operator
=(
from
);
138
//__chi2 = from.chi2__;
139
}
140
return
*
this
;
141
}
142
143
144
/// move operator
145
template
<
template
<
typename
>
class
ALLOC
>
146
IndepTestChi2
<
ALLOC
>&
147
IndepTestChi2
<
ALLOC
>::
operator
=(
IndepTestChi2
<
ALLOC
>&&
from
) {
148
if
(
this
!= &
from
) {
149
IndependenceTest
<
ALLOC
>::
operator
=(
std
::
move
(
from
));
150
//__chi2 = std::move(from.chi2__);
151
}
152
return
*
this
;
153
}
154
155
/// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
156
template
<
template
<
typename
>
class
ALLOC
>
157
std
::
pair
<
double
,
double
>
IndepTestChi2
<
ALLOC
>::
statistics
(
158
NodeId
var1
,
159
NodeId
var2
,
160
const
std
::
vector
<
NodeId
,
ALLOC
<
NodeId
> >&
rhs_ids
) {
161
return
statistics_
(
IdCondSet
<
ALLOC
>(
var1
,
var2
,
rhs_ids
,
false
));
162
}
163
164
/// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
165
template
<
template
<
typename
>
class
ALLOC
>
166
std
::
pair
<
double
,
double
>
167
IndepTestChi2
<
ALLOC
>::
statistics_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
168
// get the countings
169
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xyz
(
170
this
->
counter_
.
counts
(
idset
,
true
));
171
const
bool
informative_external_apriori
=
this
->
apriori_
->
isInformative
();
172
if
(
informative_external_apriori
)
173
this
->
apriori_
->
addAllApriori
(
idset
,
N_xyz
);
174
const
std
::
size_t
all_size
= (
N_xyz
.
size
());
175
176
// compute the domain sizes of X and Y
177
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
178
const
auto
&
database
=
this
->
counter_
.
database
();
179
Idx
var_x
,
var_y
;
180
if
(
nodeId2cols
.
empty
()) {
181
var_x
=
idset
[0];
182
var_y
=
idset
[1];
183
}
else
{
184
var_x
=
nodeId2cols
.
second
(
idset
[0]);
185
var_y
=
nodeId2cols
.
second
(
idset
[1]);
186
}
187
188
const
std
::
size_t
X_size
=
database
.
domainSize
(
var_x
);
189
const
std
::
size_t
Y_size
=
database
.
domainSize
(
var_y
);
190
191
double
cumulStat
= 0;
192
193
// here, we distinguish idsets with conditioning nodes from those
194
// without conditioning nodes
195
if
(
idset
.
hasConditioningSet
()) {
196
const
std
::
size_t
Z_size
=
all_size
/ (
X_size
*
Y_size
);
197
198
// get the counts for the conditioning nodes
199
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xz
200
=
this
->
marginalize_
(
std
::
size_t
(1),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
201
std
::
vector
<
double
,
ALLOC
<
double
> >
N_yz
202
=
this
->
marginalize_
(
std
::
size_t
(0),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
203
std
::
vector
<
double
,
ALLOC
<
double
> >
N_z
204
=
this
->
marginalize_
(
std
::
size_t
(2),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
205
206
// indicate to the chi2 distribution the set of conditioning nodes
207
std
::
vector
<
Idx
>
cond_nodes
;
208
cond_nodes
.
reserve
(
idset
.
nbRHSIds
());
209
{
210
const
auto
cond_idset
=
idset
.
conditionalIdCondSet
().
ids
();
211
if
(
nodeId2cols
.
empty
()) {
212
for
(
const
auto
node
:
cond_idset
)
213
cond_nodes
.
push_back
(
node
);
214
}
else
{
215
for
(
const
auto
node
:
cond_idset
)
216
cond_nodes
.
push_back
(
nodeId2cols
.
second
(
node
));
217
}
218
}
219
chi2__
.
setConditioningNodes
(
cond_nodes
);
220
221
222
// now, perform sum_X sum_Y sum_Z ( #ZYX - #ZX * #ZY / #Z )^2 /
223
// (#ZX * #ZY / #Z )
224
for
(
std
::
size_t
z
=
std
::
size_t
(0),
225
beg_xz
=
std
::
size_t
(0),
226
beg_yz
=
std
::
size_t
(0),
227
xyz
=
std
::
size_t
(0);
228
z
<
Z_size
;
229
++
z
,
beg_xz
+=
X_size
,
beg_yz
+=
Y_size
) {
230
if
(
N_z
[
z
] > 0) {
231
for
(
std
::
size_t
y
=
std
::
size_t
(0),
yz
=
beg_yz
;
y
<
Y_size
;
232
++
yz
, ++
y
) {
233
for
(
std
::
size_t
x
=
std
::
size_t
(0),
xz
=
beg_xz
;
x
<
X_size
;
234
++
xz
, ++
x
, ++
xyz
) {
235
const
double
tmp1
= (
N_yz
[
yz
] *
N_xz
[
xz
]) /
N_z
[
z
];
236
if
(
tmp1
!= 0.0) {
237
const
double
tmp2
=
N_xyz
[
xyz
] -
tmp1
;
238
cumulStat
+= (
tmp2
*
tmp2
) /
tmp1
;
239
}
240
}
241
}
242
}
else
{
// moving xyz out of the loops x,y when if N_z[z]==0
243
xyz
+=
X_size
*
Y_size
;
244
}
245
}
246
}
else
{
247
// here, there is no conditioning set
248
249
// indicate to the chi2 distribution the set of conditioning nodes
250
chi2__
.
setConditioningNodes
(
empty_set__
);
251
252
// now, perform sum_X sum_Y ( #XY - (#X * #Y) / N )^2 / (#X * #Y )/N
253
254
// get the counts for all the targets and for the conditioning nodes
255
std
::
vector
<
double
,
ALLOC
<
double
> >
N_x
256
=
this
->
marginalize_
(
std
::
size_t
(1),
257
X_size
,
258
Y_size
,
259
std
::
size_t
(1),
260
N_xyz
);
261
std
::
vector
<
double
,
ALLOC
<
double
> >
N_y
262
=
this
->
marginalize_
(
std
::
size_t
(0),
263
X_size
,
264
Y_size
,
265
std
::
size_t
(1),
266
N_xyz
);
267
268
// count N
269
double
N
= 0.0;
270
for
(
const
auto
n_x
:
N_x
)
271
N
+=
n_x
;
272
273
for
(
std
::
size_t
y
=
std
::
size_t
(0),
xy
= 0;
y
<
Y_size
; ++
y
) {
274
const
double
tmp_Ny
=
N_y
[
y
];
275
for
(
std
::
size_t
x
= 0;
x
<
X_size
; ++
x
, ++
xy
) {
276
const
double
tmp1
= (
tmp_Ny
*
N_x
[
x
]) /
N
;
277
if
(
tmp1
!= 0.0) {
278
const
double
tmp2
=
N_xyz
[
xy
] -
tmp1
;
279
cumulStat
+= (
tmp2
*
tmp2
) /
tmp1
;
280
}
281
}
282
}
283
}
284
285
Size
df
=
chi2__
.
degreesOfFreedom
(
var_x
,
var_y
);
286
double
pValue
=
chi2__
.
probaChi2
(
cumulStat
,
df
);
287
return
std
::
pair
<
double
,
double
>(
cumulStat
,
pValue
);
288
}
289
290
291
/// returns the score corresponding to a given nodeset
292
template
<
template
<
typename
>
class
ALLOC
>
293
inline
double
IndepTestChi2
<
ALLOC
>::
score_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
294
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
295
Idx
var_x
,
var_y
;
296
if
(
nodeId2cols
.
empty
()) {
297
var_x
=
idset
[0];
298
var_y
=
idset
[1];
299
}
else
{
300
var_x
=
nodeId2cols
.
second
(
idset
[0]);
301
var_y
=
nodeId2cols
.
second
(
idset
[1]);
302
}
303
304
auto
stat
=
statistics_
(
idset
);
// stat contains pair(Chi2stat,pValue)
305
double
score
=
stat
.
first
;
306
307
// ok, here, score contains the value of the chi2 formula.
308
// To get a meaningful score, we shall compute the critical values
309
// for the Chi2 distribution and assign as the score of
310
// (score - alpha ) / alpha, where alpha is the critical value
311
const
double
alpha
=
chi2__
.
criticalValue
(
var_x
,
var_y
);
312
score
= (
score
-
alpha
) /
alpha
;
313
314
return
score
;
315
}
316
317
}
/* namespace learning */
318
319
}
/* namespace gum */
320
321
#
endif
/* DOXYGEN_SHOULD_SKIP_THIS */
gum::Set::emplace
INLINE void emplace(Args &&... args)
Definition:
set_tpl.h:669
gum::learning::genericBNLearner::Database::Database
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
Definition:
genericBNLearner_tpl.h:31