32 #ifndef DOXYGEN_SHOULD_SKIP_THIS 40 template <
template <
typename >
class ALLOC >
// Returns the allocator obtained from the __parsers container.
// NOTE(review): the signature line is missing from this listing; presumably this
// is getAllocator() from the member summary at the end of the file -- confirm.
43 return __parsers.get_allocator();
// Constructor taking a parser, a vector of row ranges and an id -> column
// Bijection (the declarator line and the last parameters are missing from this
// listing; `alloc` is used below, so it is presumably the trailing parameter).
// Visible steps: initialise the members with `alloc`, validate the id/column
// mapping against the parser's database, fill __parsers with copies of `parser`,
// copy `ranges` into __ranges and dispatch the ranges to the threads.
48 template <
template <
typename >
class ALLOC >
50 const DBRowGeneratorParser< ALLOC >& parser,
51 const std::vector< std::pair< std::size_t, std::size_t >,
52 ALLOC< std::pair< std::size_t, std::size_t > > >& ranges,
53 const Bijection<
NodeId, std::size_t, ALLOC< std::size_t > >&
57 __ranges(alloc), __nodeId2columns(nodeId2columns),
58 __last_DB_countings(alloc), __last_DB_ids(alloc),
59 __last_nonDB_countings(alloc), __last_nonDB_ids(alloc) {
// Every column referenced by the mapping must be smaller than the number of
// variables of the database; the text below is the message built for an
// offending column (the statement that raises it is not visible here).
61 const std::size_t db_nb_cols = parser.database().nbVariables();
62 for (
auto iter = nodeId2columns.cbegin(); iter != nodeId2columns.cend();
64 if (iter.second() >= db_nb_cols) {
66 "the mapping between ids and database columns " 67 <<
"is incorrect because Column " << iter.second()
68 <<
" does not belong to the database.");
// At least one thread is always used; one parser copy is stored per thread.
73 if (__max_nb_threads < std::size_t(1)) __max_nb_threads = std::size_t(1);
74 __parsers.reserve(__max_nb_threads);
75 for (std::size_t i = std::size_t(0); i < __max_nb_threads; ++i)
76 __parsers.push_back(parser);
// Copy the caller's ranges element by element into __ranges.
81 __ranges.reserve(
ranges.size());
82 for (
const auto& range :
ranges)
83 __ranges.push_back(range);
// Split the ranges into the per-thread ranges stored in __thread_ranges.
86 __dispatchRangesToThreads();
// Fragment of an overload taking a parser and an id -> column Bijection; the
// declarator, the remaining parameters and the body are missing from this listing.
// NOTE(review): the default-constructed ALLOC< std::pair<...> > below presumably
// belongs to an empty ranges vector forwarded to the constructor that takes
// explicit ranges -- confirm against the full source.
93 template <
template <
typename >
class ALLOC >
95 const DBRowGeneratorParser< ALLOC >& parser,
96 const Bijection<
NodeId, std::size_t, ALLOC< std::size_t > >&
102 ALLOC<
std::pair<
std::size_t,
std::size_t > > >(),
// Copy construction from `from` using allocator `alloc` (the declarator line is
// missing from this listing): every member is initialised from its counterpart
// in `from`.
// NOTE(review): the vectors receive `alloc`, whereas __nodeId2columns,
// __last_DB_ids and __last_nonDB_ids are copied without it -- confirm that this
// is intended.
108 template <
template <
typename >
class ALLOC >
110 const RecordCounter< ALLOC >& from,
112 __parsers(from.__parsers, alloc),
113 __ranges(from.__ranges, alloc),
114 __thread_ranges(from.__thread_ranges, alloc),
115 __nodeId2columns(from.__nodeId2columns),
116 __last_DB_countings(from.__last_DB_countings, alloc),
117 __last_DB_ids(from.__last_DB_ids),
118 __last_nonDB_countings(from.__last_nonDB_countings, alloc),
119 __last_nonDB_ids(from.__last_nonDB_ids),
120 __max_nb_threads(from.__max_nb_threads),
121 __min_nb_rows_per_thread(from.__min_nb_rows_per_thread) {
127 template <
template <
typename >
class ALLOC >
// Move construction from `from` using allocator `alloc` (the declarator line is
// missing from this listing): containers are moved out of `from`, while the two
// thread-tuning scalars are simply copied.
// NOTE(review): as in the copy variant, __nodeId2columns and the two id sets are
// moved without `alloc` -- confirm that this is intended.
133 template <
template <
typename >
class ALLOC >
135 RecordCounter< ALLOC >&& from,
137 __parsers(
std::move(from.__parsers), alloc),
138 __ranges(
std::move(from.__ranges), alloc),
139 __thread_ranges(
std::move(from.__thread_ranges), alloc),
140 __nodeId2columns(
std::move(from.__nodeId2columns)),
141 __last_DB_countings(
std::move(from.__last_DB_countings), alloc),
142 __last_DB_ids(
std::move(from.__last_DB_ids)),
143 __last_nonDB_countings(
std::move(from.__last_nonDB_countings), alloc),
144 __last_nonDB_ids(
std::move(from.__last_nonDB_ids)),
145 __max_nb_threads(from.__max_nb_threads),
146 __min_nb_rows_per_thread(from.__min_nb_rows_per_thread) {
152 template <
template <
typename >
class ALLOC >
// Creates a copy of this counter in storage obtained from an allocator rebound to
// RecordCounter< ALLOC >: room for one object is allocated, then the object is
// constructed from (*this, alloc).
// NOTE(review): the signature, the error handling around construct() and the
// return statement are missing from this listing; presumably this is the
// allocator-taking clone() and the deallocate() call is the clean-up path taken
// when construction fails -- confirm.
158 template <
template <
typename >
class ALLOC >
161 ALLOC< RecordCounter< ALLOC > > allocator(alloc);
162 RecordCounter< ALLOC >* new_counter = allocator.allocate(1);
164 allocator.construct(new_counter, *
this, alloc);
166 allocator.deallocate(new_counter, 1);
175 template <
template <
typename >
class ALLOC >
182 template <
template <
typename >
class ALLOC >
// Copy assignment: every member is assigned from its counterpart in `from`.
// NOTE(review): the return type line, any self-assignment guard and the return
// statement are not visible in this listing -- confirm against the full source.
189 template <
template <
typename >
class ALLOC >
191 operator=(
const RecordCounter< ALLOC >& from) {
193 __parsers = from.__parsers;
194 __ranges = from.__ranges;
195 __thread_ranges = from.__thread_ranges;
196 __nodeId2columns = from.__nodeId2columns;
197 __last_DB_countings = from.__last_DB_countings;
198 __last_DB_ids = from.__last_DB_ids;
199 __last_nonDB_countings = from.__last_nonDB_countings;
200 __last_nonDB_ids = from.__last_nonDB_ids;
201 __max_nb_threads = from.__max_nb_threads;
202 __min_nb_rows_per_thread = from.__min_nb_rows_per_thread;
// Move assignment: the containers are moved out of `from`; the two thread-tuning
// scalars are copied.
// NOTE(review): the return type line, any self-assignment guard and the return
// statement are not visible in this listing -- confirm against the full source.
209 template <
template <
typename >
class ALLOC >
211 operator=(RecordCounter< ALLOC >&& from) {
213 __parsers = std::move(from.__parsers);
214 __ranges = std::move(from.__ranges);
215 __thread_ranges = std::move(from.__thread_ranges);
216 __nodeId2columns = std::move(from.__nodeId2columns);
217 __last_DB_countings = std::move(from.__last_DB_countings);
218 __last_DB_ids = std::move(from.__last_DB_ids);
219 __last_nonDB_countings = std::move(from.__last_nonDB_countings);
220 __last_nonDB_ids = std::move(from.__last_nonDB_ids);
221 __max_nb_threads = from.__max_nb_threads;
222 __min_nb_rows_per_thread = from.__min_nb_rows_per_thread;
// Empties the four cached containers: the last database / non-database countings
// and the id sets they were computed for.
// NOTE(review): the signature line is missing from this listing; presumably this
// is clear() from the member summary at the end of the file -- confirm.
229 template <
template <
typename >
class ALLOC >
231 __last_DB_countings.clear();
232 __last_DB_ids.clear();
233 __last_nonDB_countings.clear();
234 __last_nonDB_ids.clear();
// Updates __max_nb_threads from `nb`: a request of 0, or a build in which
// isOMP() reports false, yields 1.
// NOTE(review): the signature line and the `else` that presumably guards the
// final assignment are missing from this listing; presumably this is
// setMaxNbThreads(nb) -- confirm.
239 template <
template <
typename >
class ALLOC >
241 if (nb == std::size_t(0) || !
isOMP())
242 __max_nb_threads = std::size_t(1);
244 __max_nb_threads = nb;
// Returns the current value of __max_nb_threads.
// NOTE(review): the signature line is missing from this listing; presumably this
// is nbThreads() from the member summary at the end of the file -- confirm.
249 template <
template <
typename >
class ALLOC >
251 return __max_nb_threads;
// Updates __min_nb_rows_per_thread from `nb`; a request of 0 is replaced by 1.
// NOTE(review): the signature line and the `else` that presumably guards the
// final assignment are missing from this listing; presumably this is
// setMinNbRowsPerThread(nb) -- confirm.
257 template <
template <
typename >
class ALLOC >
260 if (nb == std::size_t(0))
261 __min_nb_rows_per_thread = std::size_t(1);
263 __min_nb_rows_per_thread = nb;
// Returns the current value of __min_nb_rows_per_thread.
// NOTE(review): the signature line is missing from this listing; presumably this
// is minNbRowsPerThread() from the member summary -- confirm.
268 template <
template <
typename >
class ALLOC >
270 return __min_nb_rows_per_thread;
// Builds the error text reporting that counts cannot be performed on the
// continuous variables named in `bad_vars`; the wording switches between
// singular and plural depending on the number of names.
// NOTE(review): this listing stops inside the loop over `bad_vars`; the loop
// body and the statement that actually raises the error are not visible here.
275 template <
template <
typename >
class ALLOC >
276 void RecordCounter< ALLOC >::__raiseCheckException(
277 const std::vector< std::string, ALLOC< std::string > >& bad_vars)
const {
279 std::stringstream msg;
280 msg <<
"Counts cannot be performed on continuous variables. ";
281 msg <<
"Unfortunately the following variable";
// Exactly one offending variable: singular wording, followed by its name.
282 if (bad_vars.size() == 1)
283 msg <<
" is continuous: " << bad_vars[0];
// Several offending variables: plural wording, then the names are enumerated.
285 msg <<
"s are continuous: ";
287 for (
const auto& name : bad_vars) {
// Inspects the database variables associated with `ids`, gathers variable names
// into `bad_vars` and hands them to __raiseCheckException().
// NOTE(review): the tests that decide whether a variable is added to `bad_vars`
// are on lines missing from this listing; judging from the error text of
// __raiseCheckException they presumably select continuous variables -- confirm.
300 template <
template <
typename >
class ALLOC >
301 void RecordCounter< ALLOC >::__checkDiscreteVariables(
302 const IdSet< ALLOC >& ids)
const {
303 const std::size_t size = ids.size();
304 const DatabaseTable< ALLOC >&
database = __parsers[0].data.database();
// No explicit id -> column mapping is stored.
// NOTE(review): this branch indexes database.variable() with the loop position
// `i`, not with ids[i], whereas __getNodeIds2Columns() maps an id to
// std::size_t(id) when the mapping is empty. For an id set that is not
// {0, 1, ..., size-1} the two disagree -- confirm which one is intended.
306 if (__nodeId2columns.empty()) {
308 for (std::size_t i = std::size_t(0); i < size; ++i) {
313 std::vector< std::string, ALLOC< std::string > > bad_vars{
314 database.variable(i).name()};
// Keep scanning the remaining positions so that a single error can mention
// every offending variable.
315 for (++i; i < size; ++i) {
317 bad_vars.push_back(database.variable(i).name());
319 __raiseCheckException(bad_vars);
// An explicit mapping is stored: each id is first translated into its column.
324 for (std::size_t i = std::size_t(0); i < size; ++i) {
326 std::size_t pos = __nodeId2columns.second(ids[i]);
332 std::vector< std::string, ALLOC< std::string > > bad_vars{
333 database.variable(pos).name()};
334 for (++i; i < size; ++i) {
335 pos = __nodeId2columns.second(ids[i]);
337 bad_vars.push_back(database.variable(pos).name());
339 __raiseCheckException(bad_vars);
// Returns a const reference to the stored id -> column Bijection
// (__nodeId2columns).
// NOTE(review): the declarator line is missing from this listing; presumably
// this is nodeId2Columns() from the member summary -- confirm.
347 template <
template <
typename >
class ALLOC >
348 INLINE
const Bijection< NodeId, std::size_t, ALLOC< std::size_t > >&
350 return __nodeId2columns;
// Returns the database attached to the first parser (__parsers[0]).
// NOTE(review): the signature line is missing from this listing; presumably this
// is database() from the member summary -- confirm.
355 template <
template <
typename >
class ALLOC >
357 return __parsers[0].data.database();
// Returns the counting vector for the id set `ids`, reusing previously computed
// countings when one of the cached id sets contains `ids`, and parsing the
// database otherwise.
// NOTE(review): the declarator line (name and first parameter) is missing from
// this listing; presumably this is counts(ids, check_discrete_vars) from the
// member summary -- confirm.
362 template <
template <
typename >
class ALLOC >
363 INLINE
const std::vector< double, ALLOC< double > >&
365 const bool check_discrete_vars) {
// Early exit that empties the non-database cache and returns the (now empty)
// vector; the condition guarding it is on a line missing from this listing.
368 __last_nonDB_ids.clear();
369 __last_nonDB_countings.clear();
370 return __last_nonDB_countings;
// Prefer the cached non-database countings, then the cached database countings,
// whenever the corresponding cached id set contains `ids`.
375 if (__last_nonDB_ids.contains(ids))
376 return __extractFromCountings(
377 ids, __last_nonDB_ids, __last_nonDB_countings);
378 else if (__last_DB_ids.contains(ids))
379 return __extractFromCountings(ids, __last_DB_ids, __last_DB_countings);
// Nothing reusable: optionally validate the variables, then parse the database.
381 if (check_discrete_vars) __checkDiscreteVariables(ids);
382 return __countFromDatabase(ids);
// Builds a hash table giving, for every id of `ids`, the database column it is
// associated with.
// NOTE(review): the `else` separating the two loops and the final return of
// `res` are on lines missing from this listing.
389 template <
template <
typename >
class ALLOC >
390 HashTable< NodeId, std::size_t > RecordCounter< ALLOC >::__getNodeIds2Columns(
391 const IdSet< ALLOC >& ids)
const {
392 HashTable< NodeId, std::size_t > res(ids.size());
// Without an explicit mapping, an id is used directly as its column index.
393 if (__nodeId2columns.empty()) {
394 for (
const auto id : ids) {
395 res.insert(
id, std::size_t(
id));
// With an explicit mapping, the column is looked up in __nodeId2columns.
398 for (
const auto id : ids) {
399 res.insert(
id, __nodeId2columns.second(
id));
// Derives the countings of `subset_ids` from `superset_vect`, the countings
// already available for `superset_ids`, by summing entries of `superset_vect`
// into a result vector. On success the result is stored in
// __last_nonDB_countings (with __last_nonDB_ids = subset_ids) and a reference to
// it is returned.
// Three strategies are visible: subset ids located at the beginning of the
// superset, at its end, or anywhere.
// NOTE(review): many lines (branch conditions, loop increments, closing braces,
// exception handling) are missing from this listing; the comments below only
// describe what the visible statements do.
407 template <
template <
typename >
class ALLOC >
408 INLINE std::vector< double, ALLOC< double > >&
409 RecordCounter< ALLOC >::__extractFromCountings(
410 const IdSet< ALLOC >& subset_ids,
411 const IdSet< ALLOC >& superset_ids,
412 const std::vector<
double, ALLOC< double > >& superset_vect) {
// id -> column table for every id of the superset.
416 const auto nodeId2columns = __getNodeIds2Columns(superset_ids);
// The result holds one cell per joint value of the subset variables: its size is
// the product of their domain sizes.
420 const auto& database = __parsers[0].data.database();
421 std::size_t result_vect_size = std::size_t(1);
422 for (
const auto id : subset_ids) {
423 result_vect_size *= database.domainSize(nodeId2columns[
id]);
427 const std::size_t subset_ids_size = std::size_t(subset_ids.size());
428 std::vector< double, ALLOC< double > > result_vect(result_vect_size, 0.0);
// Case 1: is the i-th subset id also the i-th superset id, for every i?
435 bool subset_begin =
true;
436 for (std::size_t i = 0; i < subset_ids_size; ++i) {
437 if (superset_ids.pos(subset_ids[i]) != i) {
438 subset_begin =
false;
// The superset vector is read sequentially; each consecutive chunk of
// result_vect_size entries is added cell by cell to the result.
444 const std::size_t superset_vect_size = superset_vect.size();
445 std::size_t i = std::size_t(0);
446 while (i < superset_vect_size) {
447 for (std::size_t j = std::size_t(0); j < result_vect_size; ++j, ++i) {
448 result_vect[j] += superset_vect[i];
// Cache the result and return it.
454 __last_nonDB_ids = subset_ids;
455 __last_nonDB_countings = std::move(result_vect);
456 return __last_nonDB_countings;
// NOTE(review): presumably a failure path (its enclosing construct is not
// visible) that leaves the non-database cache empty -- confirm.
458 __last_nonDB_ids.clear();
459 __last_nonDB_countings.clear();
// Case 2: do the subset ids occupy the last positions of the superset, in the
// same order?
468 bool subset_end =
true;
469 const std::size_t superset_ids_size = std::size_t(superset_ids.size());
470 for (std::size_t i = 0; i < subset_ids_size; ++i) {
471 if (superset_ids.pos(subset_ids[i])
472 != i + superset_ids_size - subset_ids_size) {
// Product of the domain sizes of the leading superset ids, i.e. those that are
// not part of the subset.
481 std::size_t vect_not_subset_size = std::size_t(1);
482 for (std::size_t i = std::size_t(0);
483 i < superset_ids_size - subset_ids_size;
485 vect_not_subset_size *=
486 database.domainSize(nodeId2columns[superset_ids[i]]);
// Each result cell accumulates vect_not_subset_size entries of the superset
// vector (the increments of k and i are on a line missing from this listing).
489 std::size_t i = std::size_t(0);
490 for (std::size_t j = std::size_t(0); j < result_vect_size; ++j) {
491 for (std::size_t k = std::size_t(0); k < vect_not_subset_size;
493 result_vect[j] += superset_vect[i];
499 __last_nonDB_ids = subset_ids;
500 __last_nonDB_countings = std::move(result_vect);
501 return __last_nonDB_countings;
// NOTE(review): presumably the failure path of case 2 -- confirm.
503 __last_nonDB_ids.clear();
504 __last_nonDB_countings.clear();
// Case 3 (general): per-subset-variable bookkeeping, indexed by the rank j at
// which the variable is met while scanning the superset ids.
536 std::vector< std::size_t > before_incr(subset_ids_size);
537 std::vector< std::size_t > result_domain(subset_ids_size);
538 std::vector< std::size_t > result_offset(subset_ids_size);
540 std::size_t result_domain_size = std::size_t(1);
541 std::size_t tmp_before_incr = std::size_t(1);
542 std::vector< std::size_t > superset_order(subset_ids_size);
// Scan the superset ids; for those belonging to the subset, record
// tmp_before_incr - 1 and the rank j at which the id was met.
544 for (std::size_t h = std::size_t(0), j = std::size_t(0);
547 if (subset_ids.exists(superset_ids[h])) {
548 before_incr[j] = tmp_before_incr - 1;
549 superset_order[subset_ids.pos(superset_ids[h])] = j;
554 database.domainSize(nodeId2columns[superset_ids[h]]);
// For every subset id (in subset order) store, at its superset rank j, its
// domain size and its offset in the result vector (the product of the domain
// sizes of the preceding subset ids).
559 for (std::size_t i = 0; i < subset_ids.size(); ++i) {
560 const std::size_t domain_size =
561 database.domainSize(nodeId2columns[subset_ids[i]]);
562 const std::size_t j = superset_order[i];
563 result_domain[j] = domain_size;
564 result_offset[j] = result_domain_size;
565 result_domain_size *= domain_size;
// Working copies used as countdown counters during the scan below.
569 std::vector< std::size_t > result_value(result_domain);
570 std::vector< std::size_t > current_incr(before_incr);
571 std::vector< std::size_t > result_down(result_offset);
// result_down[j] = offset * (domain - 1): the amount subtracted from the result
// index when variable j wraps around.
573 for (std::size_t j = std::size_t(0); j < result_down.size(); ++j) {
574 result_down[j] *= (result_domain[j] - 1);
// Scan the whole superset vector once, adding each entry to the result cell
// designated by the_result_offset, then update the counters to obtain the
// offset of the next entry (several lines of that update are missing here).
578 const std::size_t superset_vect_size = superset_vect.size();
579 std::size_t the_result_offset = std::size_t(0);
580 for (std::size_t h = std::size_t(0); h < superset_vect_size; ++h) {
581 result_vect[the_result_offset] += superset_vect[h];
584 for (std::size_t k = 0; k < current_incr.size(); ++k) {
586 if (current_incr[k]) {
591 current_incr[k] = before_incr[k];
596 if (result_value[k]) {
597 the_result_offset += result_offset[k];
601 result_value[k] = result_domain[k];
602 the_result_offset -= result_down[k];
// Cache the result and return it.
608 __last_nonDB_ids = subset_ids;
609 __last_nonDB_countings = std::move(result_vect);
610 return __last_nonDB_countings;
// NOTE(review): presumably the failure path of the general case -- confirm.
612 __last_nonDB_ids.clear();
613 __last_nonDB_countings.clear();
// Computes the countings of the id set `ids` by parsing the database rows of
// __thread_ranges with one parser per thread, sums the per-thread results,
// stores the total in __last_DB_countings and returns a reference to it.
// NOTE(review): several lines are missing from this listing (loop increments,
// the container name on line 692, the `try` matching the visible `catch`, the
// definition of `this_thread`, the statement on line 746); the comments below
// only describe the visible statements.
620 template <
template <
typename >
class ALLOC >
621 std::vector< double, ALLOC< double > >&
622 RecordCounter< ALLOC >::__countFromDatabase(
const IdSet< ALLOC >& ids) {
// Nothing to count: empty the non-database cache and return its empty vector.
625 const auto& database = __parsers[0].data.database();
626 if (ids.empty() || database.empty() || __thread_ranges.empty()) {
627 __last_nonDB_countings.clear();
628 __last_nonDB_ids.clear();
629 return __last_nonDB_countings;
// id -> column table for the ids to count.
634 const auto nodeId2columns = __getNodeIds2Columns(ids);
// For every id (in IdSet order) record its domain size, its column and its
// offset in the counting vector, i.e. the product of the domain sizes of the
// ids before it; counting_vect_size ends up as the product of all domain sizes.
638 const std::size_t ids_size = ids.size();
639 std::size_t counting_vect_size = std::size_t(1);
640 std::vector< std::size_t, ALLOC< std::size_t > > domain_sizes(ids_size);
641 std::vector< std::pair< std::size_t, std::size_t >,
642 ALLOC< std::pair< std::size_t, std::size_t > > >
643 cols_offsets(ids_size);
645 std::size_t i = std::size_t(0);
646 for (
const auto id : ids) {
647 const std::size_t domain_size = database.domainSize(nodeId2columns[
id]);
648 domain_sizes[i] = domain_size;
649 cols_offsets[i].first = nodeId2columns[id];
650 cols_offsets[i].second = counting_vect_size;
651 counting_vect_size *= domain_size;
// Order the (column, offset) pairs by increasing column index.
658 std::sort(cols_offsets.begin(),
660 [](
const std::pair< std::size_t, std::size_t >& a,
661 const std::pair< std::size_t, std::size_t >& b) ->
bool {
662 return a.first < b.first;
// Number of threads = min(number of thread ranges, __max_nb_threads); new
// parsers are copied from parser 0 until there is one per thread.
666 const std::size_t nb_ranges = __thread_ranges.size();
667 const std::size_t nb_threads =
668 nb_ranges <= __max_nb_threads ? nb_ranges : __max_nb_threads;
669 while (__parsers.size() < nb_threads) {
670 ThreadData< DBRowGeneratorParser< ALLOC > > new_parser(__parsers[0]);
671 __parsers.push_back(std::move(new_parser));
// Give every parser the (sorted) list of columns of interest.
677 std::vector< std::size_t, ALLOC< std::size_t > > cols_of_interest(ids_size);
678 for (std::size_t i = std::size_t(0); i < ids_size; ++i) {
679 cols_of_interest[i] = cols_offsets[i].first;
681 for (
auto& parser : __parsers) {
682 parser.data.setColumnsOfInterest(cols_of_interest);
// Global counting vector, plus a container of per-thread vectors wrapped in
// ThreadData and initialised from it (the container's name and size are on
// lines missing from this listing; it is used below as `thread_countings`).
688 std::vector< double, ALLOC< double > > counting_vect(counting_vect_size,
690 std::vector< ThreadData< std::vector< double, ALLOC< double > > >,
691 ALLOC< ThreadData< std::vector< double, ALLOC< double > > > > >
694 ThreadData< std::vector<
double, ALLOC< double > > >(counting_vect));
// The thread ranges are processed in batches of nb_threads: inside the OpenMP
// parallel region, a thread handles range number this_thread + i when it exists.
700 for (std::size_t i = std::size_t(0); i < nb_ranges; i += nb_threads) {
701 # pragma omp parallel num_threads(int(nb_threads)) 705 if (this_thread + i < nb_ranges) {
706 DBRowGeneratorParser< ALLOC >& parser = __parsers[this_thread].data;
707 parser.setRange(__thread_ranges[this_thread + i].first,
708 __thread_ranges[this_thread + i].second);
709 std::vector< double, ALLOC< double > >& countings =
710 thread_countings[this_thread].data;
// For each row, the index of the cell to update is built from the observed
// discrete value of each column multiplied by that column's offset, and the
// row's weight is added to that cell.
714 while (parser.hasRows()) {
716 const DBRow< DBTranslatedValue >& row = parser.row();
719 std::size_t offset = std::size_t(0);
// Note: this `i` shadows the batch index `i` of the enclosing loop.
720 for (std::size_t i = std::size_t(0); i < ids_size; ++i) {
722 row[cols_offsets[i].first].discr_val * cols_offsets[i].second;
725 countings[offset] += row.weight();
// A NotFound exception raised while parsing is deliberately ignored.
727 }
catch (NotFound&) {}
// Add the per-thread countings cell by cell into the global vector.
736 for (std::size_t k = std::size_t(0); k < nb_threads; ++k) {
737 const auto& thread_counting = thread_countings[k].data;
738 for (std::size_t r = std::size_t(0); r < counting_vect_size; ++r) {
739 counting_vect[r] += thread_counting[r];
// Cache the result and return it.
745 __last_DB_countings = std::move(counting_vect);
747 return __last_DB_countings;
// Counts the rows of the range [begin, end) delivered by `parser` and
// accumulates the row weights into `countings`.
//   begin, end : passed unchanged to parser.setRange()
//   parser     : row generator parser that is iterated
//   (pairs)    : (column, offset) pairs used to compute the cell of a row; the
//                parameter name is on a line missing from this listing and is
//                used below as `cols_offsets`
//   countings  : vector updated in place
// NOTE(review): the `try` matching the visible `catch` and the left-hand side of
// the offset computation (presumably `offset +=`) are also missing here.
752 template <
template <
typename >
class ALLOC >
753 void RecordCounter< ALLOC >::__threadedCount(
754 const std::size_t begin,
755 const std::size_t end,
756 DBRowGeneratorParser< ALLOC >& parser,
757 const std::vector< std::pair< std::size_t, std::size_t >,
758 ALLOC< std::pair< std::size_t, std::size_t > > >&
760 std::vector<
double, ALLOC< double > >& countings) {
761 parser.setRange(begin, end);
764 const std::size_t nb_columns = cols_offsets.size();
765 while (parser.hasRows()) {
767 const DBRow< DBTranslatedValue >& row = parser.row();
// Cell index: built from the discrete value observed in each column multiplied
// by that column's offset.
770 std::size_t offset = std::size_t(0);
771 for (std::size_t i = std::size_t(0); i < nb_columns; ++i) {
773 row[cols_offsets[i].first].discr_val * cols_offsets[i].second;
776 countings[offset] += row.weight();
// A NotFound exception raised while parsing is deliberately ignored.
778 }
catch (NotFound&) {}
// Validates a set of row ranges against the database: a range is rejected when
// it is empty or inverted (first >= second) or when it ends beyond the number of
// rows. When at least one range is rejected, an error message listing the
// offending ranges as "[first;second)" is built.
// NOTE(review): the parameter name (used below as `new_ranges`), the name of the
// local vector (used below as `incorrect_ranges`), the `else` between the two
// wordings and the statement raising the error are on lines missing from this
// listing.
785 template <
template <
typename >
class ALLOC >
786 template <
template <
typename >
class XALLOC >
787 void RecordCounter< ALLOC >::__checkRanges(
788 const std::vector< std::pair< std::size_t, std::size_t >,
789 XALLOC< std::pair< std::size_t, std::size_t > > >&
791 const std::size_t dbsize = __parsers[0].data.database().nbRows();
792 std::vector< std::pair< std::size_t, std::size_t >,
793 ALLOC< std::pair< std::size_t, std::size_t > > >
// Collect every invalid range so that all of them can be reported at once.
795 for (
const auto& range : new_ranges) {
796 if ((range.first >= range.second) || (range.second > dbsize)) {
797 incorrect_ranges.push_back(range);
// Build the message, with plural or singular wording.
800 if (!incorrect_ranges.empty()) {
801 std::stringstream str;
802 str <<
"It is impossible to set the ranges because the following one";
803 if (incorrect_ranges.size() > 1)
804 str <<
"s are incorrect: ";
806 str <<
" is incorrect: ";
808 for (
const auto& range : incorrect_ranges) {
813 str <<
'[' << range.first <<
';' << range.second <<
')';
// Rebuilds __thread_ranges from __ranges: every non-empty range is cut into
// consecutive sub-ranges, and the resulting sub-ranges are finally sorted by
// decreasing size.
// NOTE(review): several lines are missing from this listing (the update of
// `add_range`, the branch preceding the visible `else if`, the handling of
// `rest_rows`, closing braces); the comments below describe only the visible
// statements.
822 template <
template <
typename >
class ALLOC >
823 void RecordCounter< ALLOC >::__dispatchRangesToThreads() {
824 __thread_ranges.clear();
// When no range is stored, the whole database [0, nbRows) is pushed into
// __ranges for the duration of the dispatch.
827 bool add_range =
false;
828 if (__ranges.empty()) {
829 const auto& database = __parsers[0].data.database();
830 __ranges.push_back(std::pair< std::size_t, std::size_t >(
831 std::size_t(0), database.nbRows()));
836 for (
const auto& range : __ranges) {
837 if (range.second > range.first) {
// Number of sub-ranges: rows / minimum rows per thread, capped at
// __max_nb_threads.
// NOTE(review): the lower bound that keeps nb_threads from being 0 before the
// division on line 844 is presumably on the missing lines 840-841 -- confirm.
838 const std::size_t range_size = range.second - range.first;
839 std::size_t nb_threads = range_size / __min_nb_rows_per_thread;
842 else if (nb_threads > __max_nb_threads)
843 nb_threads = __max_nb_threads;
// Rows per sub-range, and the remainder left over by the integer division.
844 std::size_t nb_rows_par_thread = range_size / nb_threads;
845 std::size_t rest_rows = range_size - nb_rows_par_thread * nb_threads;
// Emit consecutive sub-ranges, each one starting where the previous one ended.
847 std::size_t begin_index = range.first;
848 for (std::size_t i = std::size_t(0); i < nb_threads; ++i) {
849 std::size_t end_index = begin_index + nb_rows_par_thread;
850 if (rest_rows != std::size_t(0)) {
854 __thread_ranges.push_back(
855 std::pair< std::size_t, std::size_t >(begin_index, end_index));
856 begin_index = end_index;
// Remove the whole-database range again when `add_range` is set.
860 if (add_range) __ranges.clear();
// Largest sub-ranges first.
866 std::sort(__thread_ranges.begin(),
867 __thread_ranges.end(),
868 [](
const std::pair< std::size_t, std::size_t >& a,
869 const std::pair< std::size_t, std::size_t >& b) ->
bool {
870 return (a.second - a.first) > (b.second - b.first);
// Replaces the stored ranges by `new_ranges` (given with a possibly different
// allocator XALLOC): the ranges are validated with __checkRanges(), copied pair
// by pair into a vector using ALLOC, stored in __ranges, and then dispatched to
// the threads.
// NOTE(review): the declarator line, the parameter name (used below as
// `new_ranges`) and the declaration of the local vector (used below as `ranges`)
// are on lines missing from this listing; presumably this is setRanges() from
// the member summary -- confirm.
876 template <
template <
typename >
class ALLOC >
877 template <
template <
typename >
class XALLOC >
879 const std::vector< std::pair< std::size_t, std::size_t >,
880 XALLOC< std::pair< std::size_t, std::size_t > > >&
883 __checkRanges(new_ranges);
// Element-wise copy, since the source vector's allocator type may differ.
886 const std::size_t new_size = new_ranges.size();
887 std::vector< std::pair< std::size_t, std::size_t >,
888 ALLOC< std::pair< std::size_t, std::size_t > > >
890 for (std::size_t i = std::size_t(0); i < new_size; ++i) {
891 ranges[i].first = new_ranges[i].first;
892 ranges[i].second = new_ranges[i].second;
896 __ranges = std::move(ranges);
// Recompute the per-thread ranges for the new ranges.
899 __dispatchRangesToThreads();
// Does nothing when no range is stored; otherwise ends by recomputing the
// per-thread ranges with __dispatchRangesToThreads().
// NOTE(review): the signature line and the statement(s) between the early return
// and the dispatch are missing from this listing; presumably this is
// clearRanges() and the missing statement empties __ranges -- confirm.
904 template <
template <
typename >
class ALLOC >
906 if (__ranges.empty())
return;
909 __dispatchRangesToThreads();
914 template <
template <
typename >
class ALLOC >
915 INLINE
const std::vector< std::pair< std::size_t, std::size_t >,
916 ALLOC< std::pair< std::size_t, std::size_t > > >&
// Forwards `new_bn` to every parser stored in __parsers by calling
// setBayesNet(new_bn) on each of them.
// NOTE(review): the declarator line is missing from this listing; presumably
// this is setBayesNet(const BayesNet< GUM_SCALAR >&) from the member summary --
// confirm.
923 template <
template <
typename >
class ALLOC >
924 template <
typename GUM_SCALAR >
931 for (
auto& xparser : __parsers) {
932 xparser.data.setBayesNet(new_bn);
bool isOMP()
Is OMP active?
virtual RecordCounter< ALLOC > * clone() const
virtual copy constructor
unsigned int getThreadNumber()
Get the calling thread id.
const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > & nodeId2Columns() const
returns the mapping from ids to column positions in the database
void clearRanges()
resets the ranges to the single range corresponding to the whole database
std::size_t nbThreads() const
returns the number of threads used to parse the database
allocator_type getAllocator() const
returns the allocator used
Copyright 2005-2019 Pierre-Henri WUILLEMIN et Christophe GONZALES (LIP6) {prenom.nom}_at_lip6.fr.
void setMaxNbThreads(const std::size_t nb) const
changes the max number of threads used to parse the database
const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > & ranges() const
returns the current ranges
void clear()
clears all the last database-parsed countings from memory
std::size_t minNbRowsPerThread() const
returns the minimum number of rows that each thread should process
Copyright 2005-2019 Pierre-Henri WUILLEMIN et Christophe GONZALES (LIP6) {prenom.nom}_at_lip6.fr.
const std::vector< double, ALLOC< double > > & counts(const IdSet< ALLOC > &ids, const bool check_discrete_vars=false)
returns the counts over all the variables in an IdSet
ALLOC< NodeId > allocator_type
type for the allocators passed in arguments of methods
void setMinNbRowsPerThread(const std::size_t nb) const
changes the minimum number of rows a thread should process in a multithreading context
virtual ~RecordCounter()
destructor
void setRanges(const std::vector< std::pair< std::size_t, std::size_t >, XALLOC< std::pair< std::size_t, std::size_t > > > &new_ranges)
sets new ranges to perform the countings
RecordCounter< ALLOC > & operator=(const RecordCounter< ALLOC > &from)
copy operator
RecordCounter(const DBRowGeneratorParser< ALLOC > &parser, const std::vector< std::pair< std::size_t, std::size_t >, ALLOC< std::pair< std::size_t, std::size_t > > > &ranges, const Bijection< NodeId, std::size_t, ALLOC< std::size_t > > &nodeId2columns=Bijection< NodeId, std::size_t, ALLOC< std::size_t > >(), const allocator_type &alloc=allocator_type())
default constructor
Size NodeId
Type for node ids.
#define GUM_ERROR(type, msg)
void setBayesNet(const BayesNet< GUM_SCALAR > &new_bn)
assign a new Bayes net to all the counter's generators depending on a BN
const DatabaseTable< ALLOC > & database() const
returns the database on which we perform the counts