aGrUM
0.20.2
a C++ library for (probabilistic) graphical models
indepTestG2_tpl.h
Go to the documentation of this file.
1
/**
2
*
3
* Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4
* info_at_agrum_dot_org
5
*
6
* This library is free software: you can redistribute it and/or modify
7
* it under the terms of the GNU Lesser General Public License as published by
8
* the Free Software Foundation, either version 3 of the License, or
9
* (at your option) any later version.
10
*
11
* This library is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU Lesser General Public License for more details.
15
*
16
* You should have received a copy of the GNU Lesser General Public License
17
* along with this library. If not, see <http://www.gnu.org/licenses/>.
18
*
19
*/
20
21
22
/** @file
23
* @brief the class for computing G2 scores
24
*
25
* @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26
*/
27
28
#
ifndef
DOXYGEN_SHOULD_SKIP_THIS
29
30
namespace
gum
{
31
32
namespace
learning
{
33
34
/// default constructor
35
template
<
template
<
typename
>
class
ALLOC >
36
INLINE IndepTestG2<
ALLOC
>::
IndepTestG2
(
37
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
38
const
Apriori
<
ALLOC
>&
apriori
,
39
const
std
::
vector
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
>,
40
ALLOC
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
> > >&
ranges
,
41
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
42
nodeId2columns
,
43
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
44
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
ranges
,
nodeId2columns
,
alloc
),
45
domain_sizes__
(
parser
.
database
().
domainSizes
()),
chi2__
(
domain_sizes__
) {
46
GUM_CONSTRUCTOR
(
IndepTestG2
);
47
}
48
49
50
/// default constructor
51
template
<
template
<
typename
>
class
ALLOC
>
52
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
53
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
54
const
Apriori
<
ALLOC
>&
apriori
,
55
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
56
nodeId2columns
,
57
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
58
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
nodeId2columns
,
alloc
),
59
domain_sizes__
(
parser
.
database
().
domainSizes
()),
chi2__
(
domain_sizes__
) {
60
GUM_CONSTRUCTOR
(
IndepTestG2
);
61
}
62
63
64
/// copy constructor with a given allocator
65
template
<
template
<
typename
>
class
ALLOC
>
66
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
67
const
IndepTestG2
<
ALLOC
>&
from
,
68
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
69
IndependenceTest
<
ALLOC
>(
from
,
alloc
),
70
chi2__
(
domain_sizes__
) {
71
GUM_CONS_CPY
(
IndepTestG2
);
72
}
73
74
75
/// copy constructor
76
template
<
template
<
typename
>
class
ALLOC
>
77
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
const
IndepTestG2
<
ALLOC
>&
from
) :
78
IndepTestG2
<
ALLOC
>(
from
,
from
.
getAllocator
()) {}
79
80
81
/// move constructor with a given allocator
82
template
<
template
<
typename
>
class
ALLOC
>
83
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
84
IndepTestG2
<
ALLOC
>&&
from
,
85
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
86
IndependenceTest
<
ALLOC
>(
std
::
move
(
from
),
alloc
),
87
domain_sizes__
(
from
.
domain_sizes__
),
chi2__
(
domain_sizes__
) {
88
GUM_CONS_MOV
(
IndepTestG2
);
89
}
90
91
92
/// move constructor
93
template
<
template
<
typename
>
class
ALLOC
>
94
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
IndepTestG2
<
ALLOC
>&&
from
) :
95
IndepTestG2
<
ALLOC
>(
std
::
move
(
from
),
from
.
getAllocator
()) {}
96
97
98
/// virtual copy constructor with a given allocator
99
template
<
template
<
typename
>
class
ALLOC
>
100
IndepTestG2
<
ALLOC
>*
IndepTestG2
<
ALLOC
>::
clone
(
101
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
)
const
{
102
ALLOC
<
IndepTestG2
<
ALLOC
> >
allocator
(
alloc
);
103
IndepTestG2
<
ALLOC
>*
new_score
=
allocator
.
allocate
(1);
104
try
{
105
allocator
.
construct
(
new_score
, *
this
,
alloc
);
106
}
catch
(...) {
107
allocator
.
deallocate
(
new_score
, 1);
108
throw
;
109
}
110
111
return
new_score
;
112
}
113
114
115
/// virtual copy constructor
116
template
<
template
<
typename
>
class
ALLOC
>
117
IndepTestG2
<
ALLOC
>*
IndepTestG2
<
ALLOC
>::
clone
()
const
{
118
return
clone
(
this
->
getAllocator
());
119
}
120
121
122
/// destructor
123
template
<
template
<
typename
>
class
ALLOC
>
124
IndepTestG2
<
ALLOC
>::~
IndepTestG2
() {
125
GUM_DESTRUCTOR
(
IndepTestG2
);
126
}
127
128
129
/// copy operator
130
template
<
template
<
typename
>
class
ALLOC
>
131
IndepTestG2
<
ALLOC
>&
132
IndepTestG2
<
ALLOC
>::
operator
=(
const
IndepTestG2
<
ALLOC
>&
from
) {
133
if
(
this
!= &
from
) {
134
IndependenceTest
<
ALLOC
>::
operator
=(
from
);
135
//__chi2 = from.chi2__;
136
}
137
return
*
this
;
138
}
139
140
141
/// move operator
142
template
<
template
<
typename
>
class
ALLOC
>
143
IndepTestG2
<
ALLOC
>&
144
IndepTestG2
<
ALLOC
>::
operator
=(
IndepTestG2
<
ALLOC
>&&
from
) {
145
if
(
this
!= &
from
) {
146
IndependenceTest
<
ALLOC
>::
operator
=(
std
::
move
(
from
));
147
//__chi2 = std::move(from.chi2__);
148
}
149
return
*
this
;
150
}
151
152
/// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
153
template
<
template
<
typename
>
class
ALLOC
>
154
std
::
pair
<
double
,
double
>
IndepTestG2
<
ALLOC
>::
statistics
(
155
NodeId
var1
,
156
NodeId
var2
,
157
const
std
::
vector
<
NodeId
,
ALLOC
<
NodeId
> >&
rhs_ids
) {
158
return
statistics_
(
IdCondSet
<
ALLOC
>(
var1
,
var2
,
rhs_ids
,
false
));
159
}
160
161
/// returns the score corresponding to a given nodeset
162
template
<
template
<
typename
>
class
ALLOC
>
163
std
::
pair
<
double
,
double
>
164
IndepTestG2
<
ALLOC
>::
statistics_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
165
// get the countings
166
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xyz
(
167
this
->
counter_
.
counts
(
idset
,
true
));
168
const
bool
informative_external_apriori
=
this
->
apriori_
->
isInformative
();
169
if
(
informative_external_apriori
)
170
this
->
apriori_
->
addAllApriori
(
idset
,
N_xyz
);
171
const
std
::
size_t
all_size
= (
N_xyz
.
size
());
172
173
// compute the domain sizes of X and Y
174
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
175
const
auto
&
database
=
this
->
counter_
.
database
();
176
Idx
var_x
,
var_y
;
177
if
(
nodeId2cols
.
empty
()) {
178
var_x
=
idset
[0];
179
var_y
=
idset
[1];
180
}
else
{
181
var_x
=
nodeId2cols
.
second
(
idset
[0]);
182
var_y
=
nodeId2cols
.
second
(
idset
[1]);
183
}
184
185
const
std
::
size_t
X_size
=
database
.
domainSize
(
var_x
);
186
const
std
::
size_t
Y_size
=
database
.
domainSize
(
var_y
);
187
188
double
cumulStat
= 0.0;
189
190
// here, we distinguish idsets with conditioning nodes from those
191
// without conditioning nodes
192
if
(
idset
.
hasConditioningSet
()) {
193
const
std
::
size_t
Z_size
=
all_size
/ (
X_size
*
Y_size
);
194
195
// get the counts for the conditioning nodes
196
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xz
197
=
this
->
marginalize_
(
std
::
size_t
(1),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
198
std
::
vector
<
double
,
ALLOC
<
double
> >
N_yz
199
=
this
->
marginalize_
(
std
::
size_t
(0),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
200
std
::
vector
<
double
,
ALLOC
<
double
> >
N_z
201
=
this
->
marginalize_
(
std
::
size_t
(2),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
202
203
// indicate to the chi2 distribution the set of conditioning nodes
204
std
::
vector
<
Idx
>
cond_nodes
;
205
cond_nodes
.
reserve
(
idset
.
nbRHSIds
());
206
{
207
const
auto
cond_idset
=
idset
.
conditionalIdCondSet
().
ids
();
208
if
(
nodeId2cols
.
empty
()) {
209
for
(
const
auto
node
:
cond_idset
)
210
cond_nodes
.
push_back
(
node
);
211
}
else
{
212
for
(
const
auto
node
:
cond_idset
)
213
cond_nodes
.
push_back
(
nodeId2cols
.
second
(
node
));
214
}
215
}
216
chi2__
.
setConditioningNodes
(
cond_nodes
);
217
218
219
// now, perform :
220
// sum_X sum_Y sum_Z #XYZ * log ( ( #XYZ * #Z ) / ( #XZ * #YZ ) )
221
for
(
std
::
size_t
z
=
std
::
size_t
(0),
222
beg_xz
=
std
::
size_t
(0),
223
beg_yz
=
std
::
size_t
(0),
224
xyz
=
std
::
size_t
(0);
225
z
<
Z_size
;
226
++
z
,
beg_xz
+=
X_size
,
beg_yz
+=
Y_size
) {
227
if
(
N_z
[
z
] > 0) {
228
for
(
std
::
size_t
y
=
std
::
size_t
(0),
yz
=
beg_yz
;
y
<
Y_size
;
229
++
yz
, ++
y
) {
230
for
(
std
::
size_t
x
=
std
::
size_t
(0),
xz
=
beg_xz
;
x
<
X_size
;
231
++
xz
, ++
x
, ++
xyz
) {
232
const
double
tmp1
=
N_xyz
[
xyz
] *
N_z
[
z
];
233
const
double
tmp2
=
N_yz
[
yz
] *
N_xz
[
xz
];
234
if
((
tmp1
!= 0.0) && (
tmp2
!= 0.0)) {
235
cumulStat
+=
N_xyz
[
xyz
] *
std
::
log
(
tmp1
/
tmp2
);
236
}
237
}
238
}
239
}
else
{
// moving xyz out of the loops x,y when if N_z[z]==0
240
xyz
+=
X_size
*
Y_size
;
241
}
242
}
243
}
else
{
244
// here, there is no conditioning set
245
246
// indicate to the chi2 distribution the set of conditioning nodes
247
chi2__
.
setConditioningNodes
(
empty_set__
);
248
249
// now, perform sum_X sum_Y #XY * log ( ( #XY * N ) / ( #X * #Y ) )
250
251
// get the counts for all the targets and for the conditioning nodes
252
std
::
vector
<
double
,
ALLOC
<
double
> >
N_x
253
=
this
->
marginalize_
(
std
::
size_t
(1),
254
X_size
,
255
Y_size
,
256
std
::
size_t
(1),
257
N_xyz
);
258
std
::
vector
<
double
,
ALLOC
<
double
> >
N_y
259
=
this
->
marginalize_
(
std
::
size_t
(0),
260
X_size
,
261
Y_size
,
262
std
::
size_t
(1),
263
N_xyz
);
264
265
// count N
266
double
N
= 0.0;
267
for
(
auto
n_x
:
N_x
)
268
N
+=
n_x
;
269
270
for
(
std
::
size_t
y
=
std
::
size_t
(0),
xy
= 0;
y
<
Y_size
; ++
y
) {
271
const
double
tmp_Ny
=
N_y
[
y
];
272
for
(
std
::
size_t
x
= 0;
x
<
X_size
; ++
x
, ++
xy
) {
273
const
double
tmp
= (
tmp_Ny
*
N_x
[
x
]);
274
if
((
tmp
!= 0.0) && (
N_xyz
[
xy
] != 0.0)) {
275
cumulStat
+=
N_xyz
[
xy
] *
std
::
log
((
N_xyz
[
xy
] *
N
) /
tmp
);
276
}
277
}
278
}
279
}
280
281
// used to make the G test formula asymptotically equivalent
282
// to the Pearson's chi-squared test formula
283
cumulStat
*= 2;
284
285
Size
df
=
chi2__
.
degreesOfFreedom
(
var_x
,
var_y
);
286
double
pValue
=
chi2__
.
probaChi2
(
cumulStat
,
df
);
287
return
std
::
pair
<
double
,
double
>(
cumulStat
,
pValue
);
288
}
289
290
/// returns the score corresponding to a given nodeset
291
template
<
template
<
typename
>
class
ALLOC
>
292
double
IndepTestG2
<
ALLOC
>::
score_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
293
// compute the domain sizes of X and Y
294
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
295
Idx
var_x
,
var_y
;
296
if
(
nodeId2cols
.
empty
()) {
297
var_x
=
idset
[0];
298
var_y
=
idset
[1];
299
}
else
{
300
var_x
=
nodeId2cols
.
second
(
idset
[0]);
301
var_y
=
nodeId2cols
.
second
(
idset
[1]);
302
}
303
304
auto
stat
=
statistics_
(
idset
);
305
double
score
=
stat
.
first
;
306
307
// ok, here, score contains the value of the chi2 formula.
308
// To get a meaningful score, we shall compute the critical values
309
// for the Chi2 distribution and assign as the score of
310
// (score - alpha ) / alpha, where alpha is the critical value
311
const
double
alpha
=
chi2__
.
criticalValue
(
var_x
,
var_y
);
312
score
= (
score
-
alpha
) /
alpha
;
313
314
return
score
;
315
}
316
317
}
/* namespace learning */
318
319
}
/* namespace gum */
320
321
#
endif
/* DOXYGEN_SHOULD_SKIP_THIS */
gum::Set::emplace
INLINE void emplace(Args &&... args)
Definition:
set_tpl.h:669
gum::learning::genericBNLearner::Database::Database
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
Definition:
genericBNLearner_tpl.h:31