aGrUM
0.20.3
a C++ library for (probabilistic) graphical models
indepTestG2_tpl.h
Go to the documentation of this file.
1
/**
2
*
3
* Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4
* info_at_agrum_dot_org
5
*
6
* This library is free software: you can redistribute it and/or modify
7
* it under the terms of the GNU Lesser General Public License as published by
8
* the Free Software Foundation, either version 3 of the License, or
9
* (at your option) any later version.
10
*
11
* This library is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU Lesser General Public License for more details.
15
*
16
* You should have received a copy of the GNU Lesser General Public License
17
* along with this library. If not, see <http://www.gnu.org/licenses/>.
18
*
19
*/
20
21
22
/** @file
23
* @brief the class for computing G2 scores
24
*
25
* @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26
*/
27
28
#
ifndef
DOXYGEN_SHOULD_SKIP_THIS
29
30
namespace
gum
{
31
32
namespace
learning
{
33
34
/// default constructor
35
template
<
template
<
typename
>
class
ALLOC >
36
INLINE IndepTestG2<
ALLOC
>::
IndepTestG2
(
37
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
38
const
Apriori
<
ALLOC
>&
apriori
,
39
const
std
::
vector
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
>,
40
ALLOC
<
std
::
pair
<
std
::
size_t
,
std
::
size_t
> > >&
ranges
,
41
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
nodeId2columns
,
42
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
43
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
ranges
,
nodeId2columns
,
alloc
),
44
_domain_sizes_
(
parser
.
database
().
domainSizes
()),
_chi2_
(
_domain_sizes_
) {
45
GUM_CONSTRUCTOR
(
IndepTestG2
);
46
}
47
48
49
/// default constructor
50
template
<
template
<
typename
>
class
ALLOC
>
51
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
52
const
DBRowGeneratorParser
<
ALLOC
>&
parser
,
53
const
Apriori
<
ALLOC
>&
apriori
,
54
const
Bijection
<
NodeId
,
std
::
size_t
,
ALLOC
<
std
::
size_t
> >&
nodeId2columns
,
55
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
56
IndependenceTest
<
ALLOC
>(
parser
,
apriori
,
nodeId2columns
,
alloc
),
57
_domain_sizes_
(
parser
.
database
().
domainSizes
()),
_chi2_
(
_domain_sizes_
) {
58
GUM_CONSTRUCTOR
(
IndepTestG2
);
59
}
60
61
62
/// copy constructor with a given allocator
63
template
<
template
<
typename
>
class
ALLOC
>
64
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
65
const
IndepTestG2
<
ALLOC
>&
from
,
66
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
67
IndependenceTest
<
ALLOC
>(
from
,
alloc
),
68
_chi2_
(
_domain_sizes_
) {
69
GUM_CONS_CPY
(
IndepTestG2
);
70
}
71
72
73
/// copy constructor
74
template
<
template
<
typename
>
class
ALLOC
>
75
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
const
IndepTestG2
<
ALLOC
>&
from
) :
76
IndepTestG2
<
ALLOC
>(
from
,
from
.
getAllocator
()) {}
77
78
79
/// move constructor with a given allocator
80
template
<
template
<
typename
>
class
ALLOC
>
81
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
82
IndepTestG2
<
ALLOC
>&&
from
,
83
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
) :
84
IndependenceTest
<
ALLOC
>(
std
::
move
(
from
),
alloc
),
85
_domain_sizes_
(
from
.
_domain_sizes_
),
_chi2_
(
_domain_sizes_
) {
86
GUM_CONS_MOV
(
IndepTestG2
);
87
}
88
89
90
/// move constructor
91
template
<
template
<
typename
>
class
ALLOC
>
92
INLINE
IndepTestG2
<
ALLOC
>::
IndepTestG2
(
IndepTestG2
<
ALLOC
>&&
from
) :
93
IndepTestG2
<
ALLOC
>(
std
::
move
(
from
),
from
.
getAllocator
()) {}
94
95
96
/// virtual copy constructor with a given allocator
97
template
<
template
<
typename
>
class
ALLOC
>
98
IndepTestG2
<
ALLOC
>*
IndepTestG2
<
ALLOC
>::
clone
(
99
const
typename
IndepTestG2
<
ALLOC
>::
allocator_type
&
alloc
)
const
{
100
ALLOC
<
IndepTestG2
<
ALLOC
> >
allocator
(
alloc
);
101
IndepTestG2
<
ALLOC
>*
new_score
=
allocator
.
allocate
(1);
102
try
{
103
allocator
.
construct
(
new_score
, *
this
,
alloc
);
104
}
catch
(...) {
105
allocator
.
deallocate
(
new_score
, 1);
106
throw
;
107
}
108
109
return
new_score
;
110
}
111
112
113
/// virtual copy constructor
114
template
<
template
<
typename
>
class
ALLOC
>
115
IndepTestG2
<
ALLOC
>*
IndepTestG2
<
ALLOC
>::
clone
()
const
{
116
return
clone
(
this
->
getAllocator
());
117
}
118
119
120
/// destructor
121
template
<
template
<
typename
>
class
ALLOC
>
122
IndepTestG2
<
ALLOC
>::~
IndepTestG2
() {
123
GUM_DESTRUCTOR
(
IndepTestG2
);
124
}
125
126
127
/// copy operator
128
template
<
template
<
typename
>
class
ALLOC
>
129
IndepTestG2
<
ALLOC
>&
IndepTestG2
<
ALLOC
>::
operator
=(
const
IndepTestG2
<
ALLOC
>&
from
) {
130
if
(
this
!= &
from
) {
131
IndependenceTest
<
ALLOC
>::
operator
=(
from
);
132
// __chi2 = from. _chi2_;
133
}
134
return
*
this
;
135
}
136
137
138
/// move operator
139
template
<
template
<
typename
>
class
ALLOC
>
140
IndepTestG2
<
ALLOC
>&
IndepTestG2
<
ALLOC
>::
operator
=(
IndepTestG2
<
ALLOC
>&&
from
) {
141
if
(
this
!= &
from
) {
142
IndependenceTest
<
ALLOC
>::
operator
=(
std
::
move
(
from
));
143
// __chi2 = std::move(from. _chi2_);
144
}
145
return
*
this
;
146
}
147
148
/// returns the pair <statistics,pvalue> corresponding to a given IdCondSet
149
template
<
template
<
typename
>
class
ALLOC
>
150
std
::
pair
<
double
,
double
>
151
IndepTestG2
<
ALLOC
>::
statistics
(
NodeId
var1
,
152
NodeId
var2
,
153
const
std
::
vector
<
NodeId
,
ALLOC
<
NodeId
> >&
rhs_ids
) {
154
return
statistics_
(
IdCondSet
<
ALLOC
>(
var1
,
var2
,
rhs_ids
,
false
));
155
}
156
157
/// returns the score corresponding to a given nodeset
158
template
<
template
<
typename
>
class
ALLOC
>
159
std
::
pair
<
double
,
double
>
IndepTestG2
<
ALLOC
>::
statistics_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
160
// get the countings
161
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xyz
(
this
->
counter_
.
counts
(
idset
,
true
));
162
const
bool
informative_external_apriori
=
this
->
apriori_
->
isInformative
();
163
if
(
informative_external_apriori
)
this
->
apriori_
->
addAllApriori
(
idset
,
N_xyz
);
164
const
std
::
size_t
all_size
= (
N_xyz
.
size
());
165
166
// compute the domain sizes of X and Y
167
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
168
const
auto
&
database
=
this
->
counter_
.
database
();
169
Idx
var_x
,
var_y
;
170
if
(
nodeId2cols
.
empty
()) {
171
var_x
=
idset
[0];
172
var_y
=
idset
[1];
173
}
else
{
174
var_x
=
nodeId2cols
.
second
(
idset
[0]);
175
var_y
=
nodeId2cols
.
second
(
idset
[1]);
176
}
177
178
const
std
::
size_t
X_size
=
database
.
domainSize
(
var_x
);
179
const
std
::
size_t
Y_size
=
database
.
domainSize
(
var_y
);
180
181
double
cumulStat
= 0.0;
182
183
// here, we distinguish idsets with conditioning nodes from those
184
// without conditioning nodes
185
if
(
idset
.
hasConditioningSet
()) {
186
const
std
::
size_t
Z_size
=
all_size
/ (
X_size
*
Y_size
);
187
188
// get the counts for the conditioning nodes
189
std
::
vector
<
double
,
ALLOC
<
double
> >
N_xz
190
=
this
->
marginalize_
(
std
::
size_t
(1),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
191
std
::
vector
<
double
,
ALLOC
<
double
> >
N_yz
192
=
this
->
marginalize_
(
std
::
size_t
(0),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
193
std
::
vector
<
double
,
ALLOC
<
double
> >
N_z
194
=
this
->
marginalize_
(
std
::
size_t
(2),
X_size
,
Y_size
,
Z_size
,
N_xyz
);
195
196
// indicate to the chi2 distribution the set of conditioning nodes
197
std
::
vector
<
Idx
>
cond_nodes
;
198
cond_nodes
.
reserve
(
idset
.
nbRHSIds
());
199
{
200
const
auto
cond_idset
=
idset
.
conditionalIdCondSet
().
ids
();
201
if
(
nodeId2cols
.
empty
()) {
202
for
(
const
auto
node
:
cond_idset
)
203
cond_nodes
.
push_back
(
node
);
204
}
else
{
205
for
(
const
auto
node
:
cond_idset
)
206
cond_nodes
.
push_back
(
nodeId2cols
.
second
(
node
));
207
}
208
}
209
_chi2_
.
setConditioningNodes
(
cond_nodes
);
210
211
212
// now, perform :
213
// sum_X sum_Y sum_Z #XYZ * log ( ( #XYZ * #Z ) / ( #XZ * #YZ ) )
214
for
(
std
::
size_t
z
=
std
::
size_t
(0),
215
beg_xz
=
std
::
size_t
(0),
216
beg_yz
=
std
::
size_t
(0),
217
xyz
=
std
::
size_t
(0);
218
z
<
Z_size
;
219
++
z
,
beg_xz
+=
X_size
,
beg_yz
+=
Y_size
) {
220
if
(
N_z
[
z
] > 0) {
221
for
(
std
::
size_t
y
=
std
::
size_t
(0),
yz
=
beg_yz
;
y
<
Y_size
; ++
yz
, ++
y
) {
222
for
(
std
::
size_t
x
=
std
::
size_t
(0),
xz
=
beg_xz
;
x
<
X_size
; ++
xz
, ++
x
, ++
xyz
) {
223
const
double
tmp1
=
N_xyz
[
xyz
] *
N_z
[
z
];
224
const
double
tmp2
=
N_yz
[
yz
] *
N_xz
[
xz
];
225
if
((
tmp1
!= 0.0) && (
tmp2
!= 0.0)) {
226
cumulStat
+=
N_xyz
[
xyz
] *
std
::
log
(
tmp1
/
tmp2
);
227
}
228
}
229
}
230
}
else
{
// moving xyz out of the loops x,y when if N_z[z]==0
231
xyz
+=
X_size
*
Y_size
;
232
}
233
}
234
}
else
{
235
// here, there is no conditioning set
236
237
// indicate to the chi2 distribution the set of conditioning nodes
238
_chi2_
.
setConditioningNodes
(
_empty_set_
);
239
240
// now, perform sum_X sum_Y #XY * log ( ( #XY * N ) / ( #X * #Y ) )
241
242
// get the counts for all the targets and for the conditioning nodes
243
std
::
vector
<
double
,
ALLOC
<
double
> >
N_x
244
=
this
->
marginalize_
(
std
::
size_t
(1),
X_size
,
Y_size
,
std
::
size_t
(1),
N_xyz
);
245
std
::
vector
<
double
,
ALLOC
<
double
> >
N_y
246
=
this
->
marginalize_
(
std
::
size_t
(0),
X_size
,
Y_size
,
std
::
size_t
(1),
N_xyz
);
247
248
// count N
249
double
N
= 0.0;
250
for
(
auto
n_x
:
N_x
)
251
N
+=
n_x
;
252
253
for
(
std
::
size_t
y
=
std
::
size_t
(0),
xy
= 0;
y
<
Y_size
; ++
y
) {
254
const
double
tmp_Ny
=
N_y
[
y
];
255
for
(
std
::
size_t
x
= 0;
x
<
X_size
; ++
x
, ++
xy
) {
256
const
double
tmp
= (
tmp_Ny
*
N_x
[
x
]);
257
if
((
tmp
!= 0.0) && (
N_xyz
[
xy
] != 0.0)) {
258
cumulStat
+=
N_xyz
[
xy
] *
std
::
log
((
N_xyz
[
xy
] *
N
) /
tmp
);
259
}
260
}
261
}
262
}
263
264
// used to make the G test formula asymptotically equivalent
265
// to the Pearson's chi-squared test formula
266
cumulStat
*= 2;
267
268
Size
df
=
_chi2_
.
degreesOfFreedom
(
var_x
,
var_y
);
269
double
pValue
=
_chi2_
.
probaChi2
(
cumulStat
,
df
);
270
return
std
::
pair
<
double
,
double
>(
cumulStat
,
pValue
);
271
}
272
273
/// returns the score corresponding to a given nodeset
274
template
<
template
<
typename
>
class
ALLOC
>
275
double
IndepTestG2
<
ALLOC
>::
score_
(
const
IdCondSet
<
ALLOC
>&
idset
) {
276
// compute the domain sizes of X and Y
277
const
auto
&
nodeId2cols
=
this
->
counter_
.
nodeId2Columns
();
278
Idx
var_x
,
var_y
;
279
if
(
nodeId2cols
.
empty
()) {
280
var_x
=
idset
[0];
281
var_y
=
idset
[1];
282
}
else
{
283
var_x
=
nodeId2cols
.
second
(
idset
[0]);
284
var_y
=
nodeId2cols
.
second
(
idset
[1]);
285
}
286
287
auto
stat
=
statistics_
(
idset
);
288
double
score
=
stat
.
first
;
289
290
// ok, here, score contains the value of the chi2 formula.
291
// To get a meaningful score, we shall compute the critical values
292
// for the Chi2 distribution and assign as the score of
293
// (score - alpha ) / alpha, where alpha is the critical value
294
const
double
alpha
=
_chi2_
.
criticalValue
(
var_x
,
var_y
);
295
score
= (
score
-
alpha
) /
alpha
;
296
297
return
score
;
298
}
299
300
}
/* namespace learning */
301
302
}
/* namespace gum */
303
304
#
endif
/* DOXYGEN_SHOULD_SKIP_THIS */
gum::Set::emplace
INLINE void emplace(Args &&... args)
Definition:
set_tpl.h:643
gum::learning::genericBNLearner::Database::Database
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)
Definition:
genericBNLearner_tpl.h:31