aGrUM  0.20.2
a C++ library for (probabilistic) graphical models
recordCounter_tpl.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright 2005-2020 Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class that computes countings of observations from the database.
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 
28 #include <agrum/tools/stattests/recordCounter.h>
29 
30 
31 #ifndef DOXYGEN_SHOULD_SKIP_THIS
32 
33 namespace gum {
34 
35  namespace learning {
36 
37 
38  /// returns the allocator used by the translator
39  template < template < typename > class ALLOC >
41  RecordCounter< ALLOC >::getAllocator() const {
42  return parsers__.get_allocator();
43  }
44 
45 
46  /// default constructor
47  template < template < typename > class ALLOC >
50  const std::vector< std::pair< std::size_t, std::size_t >,
51  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
52  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
54  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
59  // check that the columns in nodeId2columns do belong to the database
61  for (auto iter = nodeId2columns.cbegin(); iter != nodeId2columns.cend();
62  ++iter) {
63  if (iter.second() >= db_nb_cols) {
65  "the mapping between ids and database columns "
66  << "is incorrect because Column " << iter.second()
67  << " does not belong to the database.");
68  }
69  }
70 
71  // create the parsers. There should always be at least one parser
74  for (std::size_t i = std::size_t(0); i < max_nb_threads__; ++i)
76 
77  // check that the ranges are within the bounds of the database and
78  // save them
81  for (const auto& range: ranges)
83 
84  // dispatch the ranges for the threads
86 
88  }
89 
90 
91  /// default constructor
92  template < template < typename > class ALLOC >
95  const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
97  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
99  parser,
100  std::vector< std::pair< std::size_t, std::size_t >,
101  ALLOC< std::pair< std::size_t, std::size_t > > >(),
103  alloc) {}
104 
105 
106  /// copy constructor with a given allocator
107  template < template < typename > class ALLOC >
109  const RecordCounter< ALLOC >& from,
110  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
122  }
123 
124 
125  /// copy constructor
126  template < template < typename > class ALLOC >
129 
130 
131  /// move constructor with a given allocator
132  template < template < typename > class ALLOC >
134  RecordCounter< ALLOC >&& from,
135  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
147  }
148 
149 
150  /// move constructor
151  template < template < typename > class ALLOC >
154 
155 
156  /// virtual copy constructor with a given allocator
157  template < template < typename > class ALLOC >
159  const typename RecordCounter< ALLOC >::allocator_type& alloc) const {
162  try {
164  } catch (...) {
166  throw;
167  }
168 
169  return new_counter;
170  }
171 
172 
173  /// virtual copy constructor
174  template < template < typename > class ALLOC >
175  RecordCounter< ALLOC >* RecordCounter< ALLOC >::clone() const {
176  return clone(this->getAllocator());
177  }
178 
179 
180  /// destructor
181  template < template < typename > class ALLOC >
184  }
185 
186 
187  /// copy operator
188  template < template < typename > class ALLOC >
189  RecordCounter< ALLOC >&
191  if (this != &from) {
202  }
203  return *this;
204  }
205 
206 
207  /// move operator
208  template < template < typename > class ALLOC >
209  RecordCounter< ALLOC >&
211  if (this != &from) {
222  }
223  return *this;
224  }
225 
226 
227  /// clears all the last database-parsed countings from memory
228  template < template < typename > class ALLOC >
229  void RecordCounter< ALLOC >::clear() {
234  }
235 
236 
237  /// changes the max number of threads used to parse the database
238  template < template < typename > class ALLOC >
239  void RecordCounter< ALLOC >::setMaxNbThreads(const std::size_t nb) const {
240  if (nb == std::size_t(0) || !isOMP())
242  else
244  }
245 
246 
247  /// returns the number of threads used to parse the database
248  template < template < typename > class ALLOC >
250  return max_nb_threads__;
251  }
252 
253 
254  // changes the number min of rows a thread should process in a
255  // multithreading context
256  template < template < typename > class ALLOC >
257  void
259  if (nb == std::size_t(0))
261  else
263  }
264 
265 
266  /// returns the minimum of rows that each thread should process
267  template < template < typename > class ALLOC >
270  }
271 
272 
273  /// compute and raise the exception when some variables are continuous
274  template < template < typename > class ALLOC >
276  const std::vector< std::string, ALLOC< std::string > >& bad_vars) const {
277  // generate the exception
279  msg << "Counts cannot be performed on continuous variables. ";
280  msg << "Unfortunately the following variable";
281  if (bad_vars.size() == 1)
282  msg << " is continuous: " << bad_vars[0];
283  else {
284  msg << "s are continuous: ";
285  bool deja = false;
286  for (const auto& name: bad_vars) {
287  if (deja)
288  msg << ", ";
289  else
290  deja = true;
291  msg << name;
292  }
293  }
295  }
296 
297 
298  /// check that all the variables in an idset are discrete
299  template < template < typename > class ALLOC >
301  const IdCondSet< ALLOC >& ids) const {
302  const std::size_t size = ids.size();
304 
305  if (nodeId2columns__.empty()) {
306  // check all the ids
307  for (std::size_t i = std::size_t(0); i < size; ++i) {
309  // here, var i does not correspond to a discrete variable.
310  // we check whether there are other non discrete variables, so that
311  // we can generate an exception mentioning all these variables
313  database.variable(i).name()};
314  for (++i; i < size; ++i) {
317  }
319  }
320  }
321  } else {
322  // check all the ids
323  for (std::size_t i = std::size_t(0); i < size; ++i) {
324  // get the position of the variable in the database
326 
328  // here, id does not correspond to a discrete variable.
329  // we check whether there are other non discrete variables, so that
330  // we can generate an exception mentioning all these variables
333  for (++i; i < size; ++i) {
337  }
339  }
340  }
341  }
342  }
343 
344 
345  /// returns the mapping from ids to column positions in the database
346  template < template < typename > class ALLOC >
347  INLINE const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
348  RecordCounter< ALLOC >::nodeId2Columns() const {
349  return nodeId2columns__;
350  }
351 
352 
353  /// returns the database on which we perform the counts
354  template < template < typename > class ALLOC >
355  const DatabaseTable< ALLOC >& RecordCounter< ALLOC >::database() const {
356  return parsers__[0].data.database();
357  }
358 
359 
360  /// returns the counts for a given set of nodes
361  template < template < typename > class ALLOC >
362  INLINE const std::vector< double, ALLOC< double > >&
364  const bool check_discrete_vars) {
365  // if the idset is empty, return an empty vector
366  if (ids.empty()) {
369  return last_nonDB_countings__;
370  }
371 
372  // check whether we can extract the vector we wish to return from
373  // some already computed counting vector
378  else if (last_DB_ids__.contains(ids))
380  else {
382  return countFromDatabase__(ids);
383  }
384  }
385 
386 
387  // returns a mapping from the nodes ids to the columns of the database
388  // for a given sequence of ids
389  template < template < typename > class ALLOC >
391  const IdCondSet< ALLOC >& ids) const {
393  if (nodeId2columns__.empty()) {
394  for (const auto id: ids) {
395  res.insert(id, std::size_t(id));
396  }
397  } else {
398  for (const auto id: ids) {
400  }
401  }
402  return res;
403  }
404 
405 
406  /// extracts some new countings from previously computed ones
407  template < template < typename > class ALLOC >
408  INLINE std::vector< double, ALLOC< double > >&
410  const IdCondSet< ALLOC >& subset_ids,
411  const IdCondSet< ALLOC >& superset_ids,
412  const std::vector< double, ALLOC< double > >& superset_vect) {
413  // get a mapping between the node Ids and their columns in the database.
414  // This should be stored into nodeId2columns__, except if the latter is
415  // empty, in which case there is an identity mapping
417 
418  // we first determine the size of the output vector, the domain of
419  // each of its variables and their offsets in the output vector
420  const auto& database = parsers__[0].data.database();
422  for (const auto id: subset_ids) {
424  }
425 
426  // we create the output vector
428  std::vector< double, ALLOC< double > > result_vect(result_vect_size, 0.0);
429 
430 
431  // check if the subset_ids is the beginning of the sequence of superset_ids
432  // if this is the case, then we can outer loop over the variables not in
433  // subset_ids and, for each iteration of this loop add a vector of size
434  // result_size to result_vect
435  bool subset_begin = true;
436  for (std::size_t i = 0; i < subset_ids_size; ++i) {
437  if (superset_ids.pos(subset_ids[i]) != i) {
438  subset_begin = false;
439  break;
440  }
441  }
442 
443  if (subset_begin) {
445  std::size_t i = std::size_t(0);
446  while (i < superset_vect_size) {
447  for (std::size_t j = std::size_t(0); j < result_vect_size; ++j, ++i) {
449  }
450  }
451 
452  // save the subset_ids and the result vector
453  try {
456  return last_nonDB_countings__;
457  } catch (...) {
460  throw;
461  }
462  }
463 
464 
465  // check if subset_ids is the end of the sequence of superset_ids.
466  // In this case, as above, there are two simple loops to perform the
467  // countings
468  bool subset_end = true;
470  for (std::size_t i = 0; i < subset_ids_size; ++i) {
473  subset_end = false;
474  break;
475  }
476  }
477 
478  if (subset_end) {
479  // determine the size of the vector corresponding to the variables
480  // not belonging to subset_ids
482  for (std::size_t i = std::size_t(0);
484  ++i)
487 
488  // perform the two loops
489  std::size_t i = std::size_t(0);
490  for (std::size_t j = std::size_t(0); j < result_vect_size; ++j) {
491  for (std::size_t k = std::size_t(0); k < vect_not_subset_size;
492  ++k, ++i) {
494  }
495  }
496 
497  // save the subset_ids and the result vector
498  try {
501  return last_nonDB_countings__;
502  } catch (...) {
505  throw;
506  }
507  }
508 
509 
510  // here subset_ids is a subset of superset_ids neither prefixing nor
511  // postfixing it. So the computation is somewhat more complicated.
512 
513  // We will parse the superset_vect sequentially (using ++ operator).
514  // Sometimes, we will need to change the offset of the cell of result_vect
515  // that will be affected, sometimes not. Vector before_incr will indicate
516  // whether we need to change the offset (value = 0) or not (value different
517  // from 0). Vectors result_domain will indicate how this offset should be
518  // computed. Here is an example of the values of these vectors. Assume that
519  // superset_ids = <A,B,C,D,E> and subset_ids = <A,D,C>. Then, the three
520  // vectors before_incr, result_domain and result_offset are indexed w.r.t.
521  // A,C,D, i.e., w.r.t. to the variables in subset_ids but order w.r.t.
522  // superset_ids (this is convenient as we will parse superset_vect
523  // sequentially. For a variable or a set of variables X, let M_X denote the
524  // domain size of X. Then the contents of the three vectors are as follows:
525  // before_incr = {0, M_B, 0} (this means that whenever we iterate over B's
526  // values, the offset in result_vect does not change)
527  // result_domain = { M_A, M_C, M_D } (i.e., the domain sizes of the variables
528  // in subset_ids, order w.r.t. superset_ids)
529  // result_offset = { 1, M_A*M_D, M_A } (this corresponds to the offsets
530  // in result_vect of variables A, C and D)
531  // Vector superset_order = { 0, 2, 1} : this is a map from the indices of
532  // the variables in subset_ids to the indices of these variables in the
533  // three vectors described above. For instance, the "2" means that variable
534  // D (which is at index 1 in subset_ids) is located at index 2 in vector
535  // before_incr
539  {
543 
544  for (std::size_t h = std::size_t(0), j = std::size_t(0);
545  j < subset_ids_size;
546  ++h) {
550  tmp_before_incr = 1;
551  ++j;
552  } else {
555  }
556  }
557 
558  // compute the offsets in the order of the superset_ids
559  for (std::size_t i = 0; i < subset_ids.size(); ++i) {
560  const std::size_t domain_size
562  const std::size_t j = superset_order[i];
566  }
567  }
568 
572 
573  for (std::size_t j = std::size_t(0); j < result_down.size(); ++j) {
574  result_down[j] *= (result_domain[j] - 1);
575  }
576 
577  // now we can loop over the superset_vect to fill result_vect
580  for (std::size_t h = std::size_t(0); h < superset_vect_size; ++h) {
582 
583  // update the offset of result_vect
584  for (std::size_t k = 0; k < current_incr.size(); ++k) {
585  // check if we need modify result_offset
586  if (current_incr[k]) {
587  --current_incr[k];
588  break;
589  }
590 
592 
593  // here we shall modify result_offset
594  --result_value[k];
595 
596  if (result_value[k]) {
598  break;
599  }
600 
603  }
604  }
605 
606  // save the subset_ids and the result vector
607  try {
610  return last_nonDB_countings__;
611  } catch (...) {
614  throw;
615  }
616  }
617 
618 
619  /// parse the database to produce new countings
620  template < template < typename > class ALLOC >
621  std::vector< double, ALLOC< double > >&
623  // if the ids vector is empty or the database is empty, return an
624  // empty vector
625  const auto& database = parsers__[0].data.database();
626  if (ids.empty() || database.empty() || thread_ranges__.empty()) {
629  return last_nonDB_countings__;
630  }
631 
632  // we translate the ids into their corresponding columns in the
633  // DatabaseTable
635 
636  // we first determine the size of the counting vector, the domain of
637  // each of its variables and their offsets in the output vector
638  const std::size_t ids_size = ids.size();
641  std::vector< std::pair< std::size_t, std::size_t >,
642  ALLOC< std::pair< std::size_t, std::size_t > > >
644  {
645  std::size_t i = std::size_t(0);
646  for (const auto id: ids) {
652  ++i;
653  }
654  }
655 
656  // we sort the columns and offsets by increasing column index. This
657  // may speed up threaded countings by improving the cacheline hits
659  cols_offsets.end(),
660  [](const std::pair< std::size_t, std::size_t >& a,
661  const std::pair< std::size_t, std::size_t >& b) -> bool {
662  return a.first < b.first;
663  });
664 
665  // create parsers if needed
667  const std::size_t nb_threads
669  while (parsers__.size() < nb_threads) {
672  }
673 
674  // set the columns of interest for each parser. This specifies to the
675  // parser which columns are used for the countings. This is important
676  // for parsers like the EM parser that complete unobserved variables.
678  for (std::size_t i = std::size_t(0); i < ids_size; ++i) {
680  }
681  for (auto& parser: parsers__) {
683  }
684 
685  // allocate all the counting vectors, including that which will add
686  // all the results provided by the threads. We initialize once and
687  // for all these vectors with zeroes
688  std::vector< double, ALLOC< double > > counting_vect(counting_vect_size,
689  0.0);
690  std::vector< ThreadData< std::vector< double, ALLOC< double > > >,
691  ALLOC< ThreadData< std::vector< double, ALLOC< double > > > > >
693  nb_threads,
694  ThreadData< std::vector< double, ALLOC< double > > >(counting_vect));
695 
696  // launch the threads
697  // here we use openMP for launching the threads because, experimentally,
698  // it seems to provide results that are twice as fast as the results
699  // with the std::thread
700  for (std::size_t i = std::size_t(0); i < nb_ranges; i += nb_threads) {
701 # pragma omp parallel num_threads(int(nb_threads))
702  {
703  // get the number of the thread
705  if (this_thread + i < nb_ranges) {
709  std::vector< double, ALLOC< double > >& countings
711 
712  // parse the database
713  try {
714  while (parser.hasRows()) {
715  // get the observed rows
716  const DBRow< DBTranslatedValue >& row = parser.row();
717 
718  // fill the counts for the current row
719  std::size_t offset = std::size_t(0);
720  for (std::size_t i = std::size_t(0); i < ids_size; ++i) {
722  * cols_offsets[i].second;
723  }
724 
725  countings[offset] += row.weight();
726  }
727  } catch (NotFound&) {} // this exception is raised by the row filter
728  // if the row generators create no output row
729  // from the last rows of the database
730  }
731  }
732  }
733 
734 
735  // add the counts to counting_vect
736  for (std::size_t k = std::size_t(0); k < nb_threads; ++k) {
737  const auto& thread_counting = thread_countings[k].data;
738  for (std::size_t r = std::size_t(0); r < counting_vect_size; ++r) {
740  }
741  }
742 
743  // save the final results
744  last_DB_ids__ = ids;
746 
747  return last_DB_countings__;
748  }
749 
750 
751  /// the method used by threads to produce countings by parsing the database
752  template < template < typename > class ALLOC >
754  const std::size_t begin,
755  const std::size_t end,
757  const std::vector< std::pair< std::size_t, std::size_t >,
758  ALLOC< std::pair< std::size_t, std::size_t > > >&
759  cols_offsets,
760  std::vector< double, ALLOC< double > >& countings) {
762 
763  try {
765  while (parser.hasRows()) {
766  // get the observed filtered rows
767  const DBRow< DBTranslatedValue >& row = parser.row();
768 
769  // fill the counts for the current row
770  std::size_t offset = std::size_t(0);
771  for (std::size_t i = std::size_t(0); i < nb_columns; ++i) {
772  offset
774  }
775 
776  countings[offset] += row.weight();
777  }
778  } catch (NotFound&) {} // this exception is raised by the row filter if the
779  // row generators create no output row from the last
780  // rows of the database
781  }
782 
783 
784  /// checks that the ranges passed in argument are ok or raise an exception
785  template < template < typename > class ALLOC >
786  template < template < typename > class XALLOC >
788  const std::vector< std::pair< std::size_t, std::size_t >,
789  XALLOC< std::pair< std::size_t, std::size_t > > >&
790  new_ranges) const {
791  const std::size_t dbsize = parsers__[0].data.database().nbRows();
792  std::vector< std::pair< std::size_t, std::size_t >,
793  ALLOC< std::pair< std::size_t, std::size_t > > >
795  for (const auto& range: new_ranges) {
796  if ((range.first >= range.second) || (range.second > dbsize)) {
798  }
799  }
800  if (!incorrect_ranges.empty()) {
802  str << "It is impossible to set the ranges because the following one";
803  if (incorrect_ranges.size() > 1)
804  str << "s are incorrect: ";
805  else
806  str << " is incorrect: ";
807  bool deja = false;
808  for (const auto& range: incorrect_ranges) {
809  if (deja)
810  str << ", ";
811  else
812  deja = true;
813  str << '[' << range.first << ';' << range.second << ')';
814  }
815 
817  }
818  }
819 
820 
821  /// sets the ranges within which each thread should perform its computations
822  template < template < typename > class ALLOC >
825 
826  // ensure that ranges__ contains the ranges asked by the user
827  bool add_range = false;
828  if (ranges__.empty()) {
829  const auto& database = parsers__[0].data.database();
831  std::pair< std::size_t, std::size_t >(std::size_t(0),
832  database.nbRows()));
833  add_range = true;
834  }
835 
836  // dispatch the ranges
837  for (const auto& range: ranges__) {
838  if (range.second > range.first) {
841  if (nb_threads < 1)
842  nb_threads = 1;
843  else if (nb_threads > max_nb_threads__)
847 
849  for (std::size_t i = std::size_t(0); i < nb_threads; ++i) {
851  if (rest_rows != std::size_t(0)) {
852  ++end_index;
853  --rest_rows;
854  }
858  }
859  }
860  }
861  if (add_range) ranges__.clear();
862 
863  // sort ranges by decreasing range size, so that if the number of
864  // ranges exceeds the number of threads allowed, we start a first round of
865  // threads with the highest range, then another round with lower ranges,
866  // and so on until all the ranges have been processed
869  [](const std::pair< std::size_t, std::size_t >& a,
870  const std::pair< std::size_t, std::size_t >& b) -> bool {
871  return (a.second - a.first) > (b.second - b.first);
872  });
873  }
874 
875 
876  /// sets new ranges to perform the countings
877  template < template < typename > class ALLOC >
878  template < template < typename > class XALLOC >
879  void RecordCounter< ALLOC >::setRanges(
880  const std::vector< std::pair< std::size_t, std::size_t >,
881  XALLOC< std::pair< std::size_t, std::size_t > > >&
882  new_ranges) {
883  // first, we check that all ranges are within the database's bounds
885 
886  // since the ranges are OK, save them and clear the counting caches
887  const std::size_t new_size = new_ranges.size();
888  std::vector< std::pair< std::size_t, std::size_t >,
889  ALLOC< std::pair< std::size_t, std::size_t > > >
890  ranges(new_size);
891  for (std::size_t i = std::size_t(0); i < new_size; ++i) {
894  }
895 
896  clear();
897  ranges__ = std::move(ranges);
898 
899  // dispatch the ranges to the threads
901  }
902 
903 
904  /// reset the ranges to the one range corresponding to the whole database
905  template < template < typename > class ALLOC >
906  void RecordCounter< ALLOC >::clearRanges() {
907  if (ranges__.empty()) return;
908  clear();
909  ranges__.clear();
911  }
912 
913 
914  /// returns the current ranges
915  template < template < typename > class ALLOC >
916  INLINE const std::vector< std::pair< std::size_t, std::size_t >,
917  ALLOC< std::pair< std::size_t, std::size_t > > >&
918  RecordCounter< ALLOC >::ranges() const {
919  return ranges__;
920  }
921 
922 
923  /// assign a new Bayes net to all the counter's generators depending on a BN
924  template < template < typename > class ALLOC >
925  template < typename GUM_SCALAR >
926  INLINE void
928  // remove the caches
929  clear();
930 
931  // assign the new BN
932  for (auto& xparser: parsers__) {
934  }
935  }
936 
937  } /* namespace learning */
938 
939 } /* namespace gum */
940 
941 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:669
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)