aGrUM  0.20.3
a C++ library for (probabilistic) graphical models
recordCounter_tpl.h
Go to the documentation of this file.
1 /**
2  *
3  * Copyright (c) 2005-2021 by Pierre-Henri WUILLEMIN(@LIP6) & Christophe GONZALES(@AMU)
4  * info_at_agrum_dot_org
5  *
6  * This library is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU Lesser General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public License
17  * along with this library. If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 
22 /** @file
23  * @brief The class that computes countings of observations from the database.
24  *
25  * @author Christophe GONZALES(@AMU) and Pierre-Henri WUILLEMIN(@LIP6)
26  */
27 
28 #include <agrum/tools/stattests/recordCounter.h>
29 
30 
31 #ifndef DOXYGEN_SHOULD_SKIP_THIS
32 
33 namespace gum {
34 
35  namespace learning {
36 
37 
38  /// returns the allocator used by the translator
39  template < template < typename > class ALLOC >
41  RecordCounter< ALLOC >::getAllocator() const {
42  return _parsers_.get_allocator();
43  }
44 
45 
46  /// default constructor
47  template < template < typename > class ALLOC >
50  const std::vector< std::pair< std::size_t, std::size_t >,
51  ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
53  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
57  // check that the columns in nodeId2columns do belong to the database
59  for (auto iter = nodeId2columns.cbegin(); iter != nodeId2columns.cend(); ++iter) {
60  if (iter.second() >= db_nb_cols) {
62  "the mapping between ids and database columns "
63  << "is incorrect because Column " << iter.second()
64  << " does not belong to the database.");
65  }
66  }
67 
68  // create the parsers. There should always be at least one parser
71  for (std::size_t i = std::size_t(0); i < _max_nb_threads_; ++i)
73 
74  // check that the ranges are within the bounds of the database and
75  // save them
78  for (const auto& range: ranges)
80 
81  // dispatch the ranges for the threads
83 
85  }
86 
87 
88  /// default constructor
89  template < template < typename > class ALLOC >
93  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
95  std::vector< std::pair< std::size_t, std::size_t >,
96  ALLOC< std::pair< std::size_t, std::size_t > > >(),
98  alloc) {}
99 
100 
101  /// copy constructor with a given allocator
102  template < template < typename > class ALLOC >
104  const RecordCounter< ALLOC >& from,
105  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
114  }
115 
116 
117  /// copy constructor
118  template < template < typename > class ALLOC >
121 
122 
123  /// move constructor with a given allocator
124  template < template < typename > class ALLOC >
126  RecordCounter< ALLOC >&& from,
127  const typename RecordCounter< ALLOC >::allocator_type& alloc) :
138  }
139 
140 
141  /// move constructor
142  template < template < typename > class ALLOC >
145 
146 
147  /// virtual copy constructor with a given allocator
148  template < template < typename > class ALLOC >
150  const typename RecordCounter< ALLOC >::allocator_type& alloc) const {
153  try {
155  } catch (...) {
157  throw;
158  }
159 
160  return new_counter;
161  }
162 
163 
164  /// virtual copy constructor
165  template < template < typename > class ALLOC >
166  RecordCounter< ALLOC >* RecordCounter< ALLOC >::clone() const {
167  return clone(this->getAllocator());
168  }
169 
170 
171  /// destructor
172  template < template < typename > class ALLOC >
175  }
176 
177 
178  /// copy operator
179  template < template < typename > class ALLOC >
181  if (this != &from) {
192  }
193  return *this;
194  }
195 
196 
197  /// move operator
198  template < template < typename > class ALLOC >
200  if (this != &from) {
211  }
212  return *this;
213  }
214 
215 
216  /// clears all the last database-parsed countings from memory
217  template < template < typename > class ALLOC >
218  void RecordCounter< ALLOC >::clear() {
223  }
224 
225 
226  /// changes the max number of threads used to parse the database
227  template < template < typename > class ALLOC >
228  void RecordCounter< ALLOC >::setMaxNbThreads(const std::size_t nb) const {
229  if (nb == std::size_t(0) || !isOMP())
231  else
233  }
234 
235 
236  /// returns the number of threads used to parse the database
237  template < template < typename > class ALLOC >
239  return _max_nb_threads_;
240  }
241 
242 
243  // changes the number min of rows a thread should process in a
244  // multithreading context
245  template < template < typename > class ALLOC >
246  void RecordCounter< ALLOC >::setMinNbRowsPerThread(const std::size_t nb) const {
247  if (nb == std::size_t(0))
249  else
251  }
252 
253 
254  /// returns the minimum of rows that each thread should process
255  template < template < typename > class ALLOC >
258  }
259 
260 
261  /// compute and raise the exception when some variables are continuous
262  template < template < typename > class ALLOC >
264  const std::vector< std::string, ALLOC< std::string > >& bad_vars) const {
265  // generate the exception
267  msg << "Counts cannot be performed on continuous variables. ";
268  msg << "Unfortunately the following variable";
269  if (bad_vars.size() == 1)
270  msg << " is continuous: " << bad_vars[0];
271  else {
272  msg << "s are continuous: ";
273  bool deja = false;
274  for (const auto& name: bad_vars) {
275  if (deja)
276  msg << ", ";
277  else
278  deja = true;
279  msg << name;
280  }
281  }
283  }
284 
285 
286  /// check that all the variables in an idset are discrete
287  template < template < typename > class ALLOC >
288  void RecordCounter< ALLOC >::_checkDiscreteVariables_(const IdCondSet< ALLOC >& ids) const {
289  const std::size_t size = ids.size();
291 
292  if (_nodeId2columns_.empty()) {
293  // check all the ids
294  for (std::size_t i = std::size_t(0); i < size; ++i) {
296  // here, var i does not correspond to a discrete variable.
297  // we check whether there are other non discrete variables, so that
298  // we can generate an exception mentioning all these variables
300  for (++i; i < size; ++i) {
303  }
305  }
306  }
307  } else {
308  // check all the ids
309  for (std::size_t i = std::size_t(0); i < size; ++i) {
310  // get the position of the variable in the database
312 
314  // here, id does not correspond to a discrete variable.
315  // we check whether there are other non discrete variables, so that
316  // we can generate an exception mentioning all these variables
319  for (++i; i < size; ++i) {
323  }
325  }
326  }
327  }
328  }
329 
330 
331  /// returns the mapping from ids to column positions in the database
332  template < template < typename > class ALLOC >
333  INLINE const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
334  RecordCounter< ALLOC >::nodeId2Columns() const {
335  return _nodeId2columns_;
336  }
337 
338 
339  /// returns the database on which we perform the counts
340  template < template < typename > class ALLOC >
341  const DatabaseTable< ALLOC >& RecordCounter< ALLOC >::database() const {
342  return _parsers_[0].data.database();
343  }
344 
345 
346  /// returns the counts for a given set of nodes
347  template < template < typename > class ALLOC >
348  INLINE const std::vector< double, ALLOC< double > >&
350  const bool check_discrete_vars) {
351  // if the idset is empty, return an empty vector
352  if (ids.empty()) {
355  return _last_nonDB_countings_;
356  }
357 
358  // check whether we can extract the vector we wish to return from
359  // some already computed counting vector
362  else if (_last_DB_ids_.contains(ids))
364  else {
366  return _countFromDatabase_(ids);
367  }
368  }
369 
370 
371  // returns a mapping from the nodes ids to the columns of the database
372  // for a given sequence of ids
373  template < template < typename > class ALLOC >
377  if (_nodeId2columns_.empty()) {
378  for (const auto id: ids) {
379  res.insert(id, std::size_t(id));
380  }
381  } else {
382  for (const auto id: ids) {
384  }
385  }
386  return res;
387  }
388 
389 
390  /// extracts some new countings from previously computed ones
391  template < template < typename > class ALLOC >
392  INLINE std::vector< double, ALLOC< double > >& RecordCounter< ALLOC >::_extractFromCountings_(
393  const IdCondSet< ALLOC >& subset_ids,
394  const IdCondSet< ALLOC >& superset_ids,
395  const std::vector< double, ALLOC< double > >& superset_vect) {
396  // get a mapping between the node Ids and their columns in the database.
397  // This should be stored into _nodeId2columns_, except if the latter is
398  // empty, in which case there is an identity mapping
400 
401  // we first determine the size of the output vector, the domain of
402  // each of its variables and their offsets in the output vector
403  const auto& database = _parsers_[0].data.database();
405  for (const auto id: subset_ids) {
407  }
408 
409  // we create the output vector
411  std::vector< double, ALLOC< double > > result_vect(result_vect_size, 0.0);
412 
413 
414  // check if the subset_ids is the beginning of the sequence of superset_ids
415  // if this is the case, then we can outer loop over the variables not in
416  // subset_ids and, for each iteration of this loop add a vector of size
417  // result_size to result_vect
418  bool subset_begin = true;
419  for (std::size_t i = 0; i < subset_ids_size; ++i) {
420  if (superset_ids.pos(subset_ids[i]) != i) {
421  subset_begin = false;
422  break;
423  }
424  }
425 
426  if (subset_begin) {
428  std::size_t i = std::size_t(0);
429  while (i < superset_vect_size) {
430  for (std::size_t j = std::size_t(0); j < result_vect_size; ++j, ++i) {
432  }
433  }
434 
435  // save the subset_ids and the result vector
436  try {
439  return _last_nonDB_countings_;
440  } catch (...) {
443  throw;
444  }
445  }
446 
447 
448  // check if subset_ids is the end of the sequence of superset_ids.
449  // In this case, as above, there are two simple loops to perform the
450  // countings
451  bool subset_end = true;
453  for (std::size_t i = 0; i < subset_ids_size; ++i) {
455  subset_end = false;
456  break;
457  }
458  }
459 
460  if (subset_end) {
461  // determine the size of the vector corresponding to the variables
462  // not belonging to subset_ids
464  for (std::size_t i = std::size_t(0); i < superset_ids_size - subset_ids_size; ++i)
466 
467  // perform the two loops
468  std::size_t i = std::size_t(0);
469  for (std::size_t j = std::size_t(0); j < result_vect_size; ++j) {
470  for (std::size_t k = std::size_t(0); k < vect_not_subset_size; ++k, ++i) {
472  }
473  }
474 
475  // save the subset_ids and the result vector
476  try {
479  return _last_nonDB_countings_;
480  } catch (...) {
483  throw;
484  }
485  }
486 
487 
488  // here subset_ids is a subset of superset_ids neither prefixing nor
489  // postfixing it. So the computation is somewhat more complicated.
490 
491  // We will parse the superset_vect sequentially (using ++ operator).
492  // Sometimes, we will need to change the offset of the cell of result_vect
493  // that will be affected, sometimes not. Vector before_incr will indicate
494  // whether we need to change the offset (value = 0) or not (value different
495  // from 0). Vectors result_domain will indicate how this offset should be
496  // computed. Here is an example of the values of these vectors. Assume that
497  // superset_ids = <A,B,C,D,E> and subset_ids = <A,D,C>. Then, the three
498  // vectors before_incr, result_domain and result_offset are indexed w.r.t.
499  // A,C,D, i.e., w.r.t. to the variables in subset_ids but order w.r.t.
500  // superset_ids (this is convenient as we will parse superset_vect
501  // sequentially. For a variable or a set of variables X, let M_X denote the
502  // domain size of X. Then the contents of the three vectors are as follows:
503  // before_incr = {0, M_B, 0} (this means that whenever we iterate over B's
504  // values, the offset in result_vect does not change)
505  // result_domain = { M_A, M_C, M_D } (i.e., the domain sizes of the variables
506  // in subset_ids, order w.r.t. superset_ids)
507  // result_offset = { 1, M_A*M_D, M_A } (this corresponds to the offsets
508  // in result_vect of variables A, C and D)
509  // Vector superset_order = { 0, 2, 1} : this is a map from the indices of
510  // the variables in subset_ids to the indices of these variables in the
511  // three vectors described above. For instance, the "2" means that variable
512  // D (which is at index 1 in subset_ids) is located at index 2 in vector
513  // before_incr
517  {
521 
522  for (std::size_t h = std::size_t(0), j = std::size_t(0); j < subset_ids_size; ++h) {
526  tmp_before_incr = 1;
527  ++j;
528  } else {
530  }
531  }
532 
533  // compute the offsets in the order of the superset_ids
534  for (std::size_t i = 0; i < subset_ids.size(); ++i) {
536  const std::size_t j = superset_order[i];
540  }
541  }
542 
546 
547  for (std::size_t j = std::size_t(0); j < result_down.size(); ++j) {
548  result_down[j] *= (result_domain[j] - 1);
549  }
550 
551  // now we can loop over the superset_vect to fill result_vect
554  for (std::size_t h = std::size_t(0); h < superset_vect_size; ++h) {
556 
557  // update the offset of result_vect
558  for (std::size_t k = 0; k < current_incr.size(); ++k) {
559  // check if we need modify result_offset
560  if (current_incr[k]) {
561  --current_incr[k];
562  break;
563  }
564 
566 
567  // here we shall modify result_offset
568  --result_value[k];
569 
570  if (result_value[k]) {
572  break;
573  }
574 
577  }
578  }
579 
580  // save the subset_ids and the result vector
581  try {
584  return _last_nonDB_countings_;
585  } catch (...) {
588  throw;
589  }
590  }
591 
592 
593  /// parse the database to produce new countings
594  template < template < typename > class ALLOC >
595  std::vector< double, ALLOC< double > >&
597  // if the ids vector is empty or the database is empty, return an
598  // empty vector
599  const auto& database = _parsers_[0].data.database();
600  if (ids.empty() || database.empty() || _thread_ranges_.empty()) {
603  return _last_nonDB_countings_;
604  }
605 
606  // we translate the ids into their corresponding columns in the
607  // DatabaseTable
609 
610  // we first determine the size of the counting vector, the domain of
611  // each of its variables and their offsets in the output vector
612  const std::size_t ids_size = ids.size();
615  std::vector< std::pair< std::size_t, std::size_t >,
616  ALLOC< std::pair< std::size_t, std::size_t > > >
618  {
619  std::size_t i = std::size_t(0);
620  for (const auto id: ids) {
626  ++i;
627  }
628  }
629 
630  // we sort the columns and offsets by increasing column index. This
631  // may speed up threaded countings by improving the cacheline hits
632  std::sort(
634  cols_offsets.end(),
635  [](const std::pair< std::size_t, std::size_t >& a,
636  const std::pair< std::size_t, std::size_t >& b) -> bool { return a.first < b.first; });
637 
638  // create parsers if needed
641  while (_parsers_.size() < nb_threads) {
644  }
645 
646  // set the columns of interest for each parser. This specifies to the
647  // parser which columns are used for the countings. This is important
648  // for parsers like the EM parser that complete unobserved variables.
650  for (std::size_t i = std::size_t(0); i < ids_size; ++i) {
652  }
653  for (auto& parser: _parsers_) {
655  }
656 
657  // allocate all the counting vectors, including that which will add
658  // all the results provided by the threads. We initialize once and
659  // for all these vectors with zeroes
660  std::vector< double, ALLOC< double > > counting_vect(counting_vect_size, 0.0);
661  std::vector< ThreadData< std::vector< double, ALLOC< double > > >,
662  ALLOC< ThreadData< std::vector< double, ALLOC< double > > > > >
664  ThreadData< std::vector< double, ALLOC< double > > >(counting_vect));
665 
666  // launch the threads
667  // here we use openMP for launching the threads because, experimentally,
668  // it seems to provide results that are twice as fast as the results
669  // with the std::thread
670  for (std::size_t i = std::size_t(0); i < nb_ranges; i += nb_threads) {
671 # pragma omp parallel num_threads(int(nb_threads))
672  {
673  // get the number of the thread
675  if (this_thread + i < nb_ranges) {
679  std::vector< double, ALLOC< double > >& countings = thread_countings[this_thread].data;
680 
681  // parse the database
682  try {
683  while (parser.hasRows()) {
684  // get the observed rows
685  const DBRow< DBTranslatedValue >& row = parser.row();
686 
687  // fill the counts for the current row
688  std::size_t offset = std::size_t(0);
689  for (std::size_t i = std::size_t(0); i < ids_size; ++i) {
691  }
692 
693  countings[offset] += row.weight();
694  }
695  } catch (NotFound&) {} // this exception is raised by the row filter
696  // if the row generators create no output row
697  // from the last rows of the database
698  }
699  }
700  }
701 
702 
703  // add the counts to counting_vect
704  for (std::size_t k = std::size_t(0); k < nb_threads; ++k) {
705  const auto& thread_counting = thread_countings[k].data;
706  for (std::size_t r = std::size_t(0); r < counting_vect_size; ++r) {
708  }
709  }
710 
711  // save the final results
712  _last_DB_ids_ = ids;
714 
715  return _last_DB_countings_;
716  }
717 
718 
719  /// the method used by threads to produce countings by parsing the database
720  template < template < typename > class ALLOC >
722  const std::size_t begin,
723  const std::size_t end,
725  const std::vector< std::pair< std::size_t, std::size_t >,
726  ALLOC< std::pair< std::size_t, std::size_t > > >& cols_offsets,
727  std::vector< double, ALLOC< double > >& countings) {
729 
730  try {
732  while (parser.hasRows()) {
733  // get the observed filtered rows
734  const DBRow< DBTranslatedValue >& row = parser.row();
735 
736  // fill the counts for the current row
737  std::size_t offset = std::size_t(0);
738  for (std::size_t i = std::size_t(0); i < nb_columns; ++i) {
740  }
741 
742  countings[offset] += row.weight();
743  }
744  } catch (NotFound&) {} // this exception is raised by the row filter if the
745  // row generators create no output row from the last
746  // rows of the database
747  }
748 
749 
750  /// checks that the ranges passed in argument are ok or raise an exception
751  template < template < typename > class ALLOC >
752  template < template < typename > class XALLOC >
754  const std::vector< std::pair< std::size_t, std::size_t >,
755  XALLOC< std::pair< std::size_t, std::size_t > > >& new_ranges) const {
756  const std::size_t dbsize = _parsers_[0].data.database().nbRows();
757  std::vector< std::pair< std::size_t, std::size_t >,
758  ALLOC< std::pair< std::size_t, std::size_t > > >
760  for (const auto& range: new_ranges) {
761  if ((range.first >= range.second) || (range.second > dbsize)) {
763  }
764  }
765  if (!incorrect_ranges.empty()) {
767  str << "It is impossible to set the ranges because the following one";
768  if (incorrect_ranges.size() > 1)
769  str << "s are incorrect: ";
770  else
771  str << " is incorrect: ";
772  bool deja = false;
773  for (const auto& range: incorrect_ranges) {
774  if (deja)
775  str << ", ";
776  else
777  deja = true;
778  str << '[' << range.first << ';' << range.second << ')';
779  }
780 
782  }
783  }
784 
785 
786  /// sets the ranges within which each thread should perform its computations
787  template < template < typename > class ALLOC >
790 
791  // ensure that _ranges_ contains the ranges asked by the user
792  bool add_range = false;
793  if (_ranges_.empty()) {
794  const auto& database = _parsers_[0].data.database();
796  std::pair< std::size_t, std::size_t >(std::size_t(0), database.nbRows()));
797  add_range = true;
798  }
799 
800  // dispatch the ranges
801  for (const auto& range: _ranges_) {
802  if (range.second > range.first) {
805  if (nb_threads < 1)
806  nb_threads = 1;
807  else if (nb_threads > _max_nb_threads_)
811 
813  for (std::size_t i = std::size_t(0); i < nb_threads; ++i) {
815  if (rest_rows != std::size_t(0)) {
816  ++end_index;
817  --rest_rows;
818  }
822  }
823  }
824  }
825  if (add_range) _ranges_.clear();
826 
827  // sort ranges by decreasing range size, so that if the number of
828  // ranges exceeds the number of threads allowed, we start a first round of
829  // threads with the highest range, then another round with lower ranges,
830  // and so on until all the ranges have been processed
833  [](const std::pair< std::size_t, std::size_t >& a,
834  const std::pair< std::size_t, std::size_t >& b) -> bool {
835  return (a.second - a.first) > (b.second - b.first);
836  });
837  }
838 
839 
840  /// sets new ranges to perform the countings
841  template < template < typename > class ALLOC >
842  template < template < typename > class XALLOC >
843  void RecordCounter< ALLOC >::setRanges(
844  const std::vector< std::pair< std::size_t, std::size_t >,
845  XALLOC< std::pair< std::size_t, std::size_t > > >& new_ranges) {
846  // first, we check that all ranges are within the database's bounds
848 
849  // since the ranges are OK, save them and clear the counting caches
850  const std::size_t new_size = new_ranges.size();
851  std::vector< std::pair< std::size_t, std::size_t >,
852  ALLOC< std::pair< std::size_t, std::size_t > > >
853  ranges(new_size);
854  for (std::size_t i = std::size_t(0); i < new_size; ++i) {
857  }
858 
859  clear();
860  _ranges_ = std::move(ranges);
861 
862  // dispatch the ranges to the threads
864  }
865 
866 
867  /// reset the ranges to the one range corresponding to the whole database
868  template < template < typename > class ALLOC >
869  void RecordCounter< ALLOC >::clearRanges() {
870  if (_ranges_.empty()) return;
871  clear();
872  _ranges_.clear();
874  }
875 
876 
877  /// returns the current ranges
878  template < template < typename > class ALLOC >
879  INLINE const std::vector< std::pair< std::size_t, std::size_t >,
880  ALLOC< std::pair< std::size_t, std::size_t > > >&
881  RecordCounter< ALLOC >::ranges() const {
882  return _ranges_;
883  }
884 
885 
886  /// assign a new Bayes net to all the counter's generators depending on a BN
887  template < template < typename > class ALLOC >
888  template < typename GUM_SCALAR >
890  // remove the caches
891  clear();
892 
893  // assign the new BN
894  for (auto& xparser: _parsers_) {
896  }
897  }
898 
899  } /* namespace learning */
900 
901 } /* namespace gum */
902 
903 #endif /* DOXYGEN_SHOULD_SKIP_THIS */
INLINE void emplace(Args &&... args)
Definition: set_tpl.h:643
Database(const std::string &filename, const BayesNet< GUM_SCALAR > &bn, const std::vector< std::string > &missing_symbols)