xhistogram.hpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614
  1. /***************************************************************************
  2. * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht *
  3. * Copyright (c) QuantStack *
  4. * *
  5. * Distributed under the terms of the BSD 3-Clause License. *
  6. * *
  7. * The full license is in the file LICENSE, distributed with this software. *
  8. ****************************************************************************/
  9. /**
  10. * @brief construct histogram
  11. */
  12. #ifndef XTENSOR_HISTOGRAM_HPP
  13. #define XTENSOR_HISTOGRAM_HPP
  14. #include "xset_operation.hpp"
  15. #include "xsort.hpp"
  16. #include "xtensor.hpp"
  17. #include "xview.hpp"
  18. using namespace xt::placeholders;
  19. namespace xt
  20. {
  21. /**
  22. * @ingroup digitize
  23. * @brief Return the indices of the bins to which each value in input array belongs.
  24. *
  25. * @param data The data.
  26. * @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic.
  27. * @param right Indicating whether the intervals include the right or the left bin edge.
  28. * @return Output array of indices, of same shape as x.
  29. */
  30. template <class E1, class E2>
  31. inline auto digitize(E1&& data, E2&& bin_edges, bool right = false)
  32. {
  33. XTENSOR_ASSERT(bin_edges.dimension() == 1);
  34. XTENSOR_ASSERT(bin_edges.size() >= 2);
  35. XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend()));
  36. XTENSOR_ASSERT(xt::amin(data)[0] >= bin_edges[0]);
  37. XTENSOR_ASSERT(xt::amax(data)[0] <= bin_edges[bin_edges.size() - 1]);
  38. return xt::searchsorted(std::forward<E2>(bin_edges), std::forward<E1>(data), right);
  39. }
  40. namespace detail
  41. {
  42. template <class R = double, class E1, class E2, class E3>
  43. inline auto histogram_imp(E1&& data, E2&& bin_edges, E3&& weights, bool density, bool equal_bins)
  44. {
  45. using size_type = common_size_type_t<std::decay_t<E1>, std::decay_t<E2>, std::decay_t<E3>>;
  46. using value_type = typename std::decay_t<E3>::value_type;
  47. XTENSOR_ASSERT(data.dimension() == 1);
  48. XTENSOR_ASSERT(weights.dimension() == 1);
  49. XTENSOR_ASSERT(bin_edges.dimension() == 1);
  50. XTENSOR_ASSERT(weights.size() == data.size());
  51. XTENSOR_ASSERT(bin_edges.size() >= 2);
  52. XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend()));
  53. size_t n_bins = bin_edges.size() - 1;
  54. xt::xtensor<value_type, 1> count = xt::zeros<value_type>({n_bins});
  55. if (equal_bins)
  56. {
  57. std::array<typename std::decay_t<E2>::value_type, 2> bounds = xt::minmax(bin_edges)();
  58. auto left = static_cast<double>(bounds[0]);
  59. auto right = static_cast<double>(bounds[1]);
  60. double norm = 1. / (right - left);
  61. for (size_t i = 0; i < data.size(); ++i)
  62. {
  63. auto v = static_cast<double>(data(i));
  64. // left and right are not bounds of data
  65. if (v >= left && v < right)
  66. {
  67. auto i_bin = static_cast<size_t>(static_cast<double>(n_bins) * (v - left) * norm);
  68. count(i_bin) += weights(i);
  69. }
  70. else if (v == right)
  71. {
  72. count(n_bins - 1) += weights(i);
  73. }
  74. }
  75. }
  76. else
  77. {
  78. auto sorter = xt::argsort(data);
  79. size_type ibin = 0;
  80. for (auto& idx : sorter)
  81. {
  82. auto item = data[idx];
  83. if (item < bin_edges[0])
  84. {
  85. continue;
  86. }
  87. if (item > bin_edges[n_bins])
  88. {
  89. break;
  90. }
  91. while (item >= bin_edges[ibin + 1] && ibin < n_bins - 1)
  92. {
  93. ++ibin;
  94. }
  95. count[ibin] += weights[idx];
  96. }
  97. }
  98. xt::xtensor<R, 1> prob = xt::cast<R>(count);
  99. if (density)
  100. {
  101. R n = static_cast<R>(data.size());
  102. for (size_type i = 0; i < bin_edges.size() - 1; ++i)
  103. {
  104. prob[i] /= (static_cast<R>(bin_edges[i + 1] - bin_edges[i]) * n);
  105. }
  106. }
  107. return prob;
  108. }
  109. } // detail
  110. /**
  111. * @ingroup histogram
  112. * @brief Compute the histogram of a set of data.
  113. *
  114. * @param data The data.
  115. * @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic.
  116. * @param weights Weight factors corresponding to each data-point.
  117. * @param density If true the resulting integral is normalized to 1. [default: false]
  118. * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
  119. */
  120. template <class R = double, class E1, class E2, class E3>
  121. inline auto histogram(E1&& data, E2&& bin_edges, E3&& weights, bool density = false)
  122. {
  123. return detail::histogram_imp<R>(
  124. std::forward<E1>(data),
  125. std::forward<E2>(bin_edges),
  126. std::forward<E3>(weights),
  127. density,
  128. false
  129. );
  130. }
  131. /**
  132. * @ingroup histogram
  133. * @brief Compute the histogram of a set of data.
  134. *
  135. * @param data The data.
  136. * @param bin_edges The bin-edges.
  137. * @param density If true the resulting integral is normalized to 1. [default: false]
  138. * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
  139. */
  140. template <class R = double, class E1, class E2>
  141. inline auto histogram(E1&& data, E2&& bin_edges, bool density = false)
  142. {
  143. using value_type = typename std::decay_t<E1>::value_type;
  144. auto n = data.size();
  145. return detail::histogram_imp<R>(
  146. std::forward<E1>(data),
  147. std::forward<E2>(bin_edges),
  148. xt::ones<value_type>({n}),
  149. density,
  150. false
  151. );
  152. }
  153. /**
  154. * @ingroup histogram
  155. * @brief Compute the histogram of a set of data.
  156. *
  157. * @param data The data.
  158. * @param bins The number of bins. [default: 10]
  159. * @param density If true the resulting integral is normalized to 1. [default: false]
  160. * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
  161. */
  162. template <class R = double, class E1>
  163. inline auto histogram(E1&& data, std::size_t bins = 10, bool density = false)
  164. {
  165. using value_type = typename std::decay_t<E1>::value_type;
  166. auto n = data.size();
  167. return detail::histogram_imp<R>(
  168. std::forward<E1>(data),
  169. histogram_bin_edges(data, xt::ones<value_type>({n}), bins),
  170. xt::ones<value_type>({n}),
  171. density,
  172. true
  173. );
  174. }
  175. /**
  176. * @ingroup histogram
  177. * @brief Compute the histogram of a set of data.
  178. *
  179. * @param data The data.
  180. * @param bins The number of bins.
  181. * @param left The lower-most edge.
  182. * @param right The upper-most edge.
  183. * @param density If true the resulting integral is normalized to 1. [default: false]
  184. * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
  185. */
  186. template <class R = double, class E1, class E2>
  187. inline auto histogram(E1&& data, std::size_t bins, E2 left, E2 right, bool density = false)
  188. {
  189. using value_type = typename std::decay_t<E1>::value_type;
  190. auto n = data.size();
  191. return detail::histogram_imp<R>(
  192. std::forward<E1>(data),
  193. histogram_bin_edges(data, left, right, bins),
  194. xt::ones<value_type>({n}),
  195. density,
  196. true
  197. );
  198. }
  199. /**
  200. * @ingroup histogram
  201. * @brief Compute the histogram of a set of data.
  202. *
  203. * @param data The data.
  204. * @param bins The number of bins.
  205. * @param weights Weight factors corresponding to each data-point.
  206. * @param density If true the resulting integral is normalized to 1. [default: false]
  207. * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
  208. */
  209. template <class R = double, class E1, class E2>
  210. inline auto histogram(E1&& data, std::size_t bins, E2&& weights, bool density = false)
  211. {
  212. return detail::histogram_imp<R>(
  213. std::forward<E1>(data),
  214. histogram_bin_edges(data, weights, bins),
  215. std::forward<E2>(weights),
  216. density,
  217. true
  218. );
  219. }
  220. /**
  221. * @ingroup histogram
  222. * @brief Compute the histogram of a set of data.
  223. *
  224. * @param data The data.
  225. * @param bins The number of bins.
  226. * @param left The lower-most edge.
  227. * @param right The upper-most edge.
  228. * @param weights Weight factors corresponding to each data-point.
  229. * @param density If true the resulting integral is normalized to 1. [default: false]
  230. * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
  231. */
  232. template <class R = double, class E1, class E2, class E3>
  233. inline auto histogram(E1&& data, std::size_t bins, E2&& weights, E3 left, E3 right, bool density = false)
  234. {
  235. return detail::histogram_imp<R>(
  236. std::forward<E1>(data),
  237. histogram_bin_edges(data, weights, left, right, bins),
  238. std::forward<E2>(weights),
  239. density,
  240. true
  241. );
  242. }
  243. /**
  244. * @ingroup histogram
  245. * @brief Defines different algorithms to be used in "histogram_bin_edges"
  246. */
  247. enum class histogram_algorithm
  248. {
  249. automatic,
  250. linspace,
  251. logspace,
  252. uniform
  253. };
  254. /**
  255. * @ingroup histogram
  256. * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
  257. *
  258. * @param data The data.
  259. * @param weights Weight factors corresponding to each data-point.
  260. * @param left The lower-most edge.
  261. * @param right The upper-most edge.
  262. * @param bins The number of bins. [default: 10]
  263. * @param mode The type of algorithm to use. [default: "auto"]
  264. * @return An one-dimensional xarray<double>, length: bins+1.
  265. */
  266. template <class E1, class E2, class E3>
  267. inline auto histogram_bin_edges(
  268. E1&& data,
  269. E2&& weights,
  270. E3 left,
  271. E3 right,
  272. std::size_t bins = 10,
  273. histogram_algorithm mode = histogram_algorithm::automatic
  274. )
  275. {
  276. // counter and return type
  277. using size_type = common_size_type_t<std::decay_t<E1>, std::decay_t<E2>>;
  278. using value_type = typename std::decay_t<E1>::value_type;
  279. using weights_type = typename std::decay_t<E2>::value_type;
  280. // basic checks
  281. // - rank
  282. XTENSOR_ASSERT(data.dimension() == 1);
  283. XTENSOR_ASSERT(weights.dimension() == 1);
  284. // - size
  285. XTENSOR_ASSERT(weights.size() == data.size());
  286. // - bounds
  287. XTENSOR_ASSERT(left <= right);
  288. // - non-empty
  289. XTENSOR_ASSERT(bins > std::size_t(0));
  290. // act on different modes
  291. switch (mode)
  292. {
  293. // bins of equal width
  294. case histogram_algorithm::automatic:
  295. {
  296. xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
  297. return bin_edges;
  298. }
  299. // bins of equal width
  300. case histogram_algorithm::linspace:
  301. {
  302. xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
  303. return bin_edges;
  304. }
  305. // bins of logarithmically increasing size
  306. case histogram_algorithm::logspace:
  307. {
  308. using rhs_value_type = std::conditional_t<xtl::is_integral<value_type>::value, double, value_type>;
  309. xtensor<value_type, 1> bin_edges = xt::cast<value_type>(
  310. xt::logspace<rhs_value_type>(std::log10(left), std::log10(right), bins + 1)
  311. );
  312. // TODO: replace above with below after 'xsimd' fix
  313. // xt::xtensor<value_type,1> bin_edges = xt::logspace<value_type>(
  314. // std::log10(left), std::log10(right), bins+1);
  315. return bin_edges;
  316. }
  317. // same amount of data in each bin
  318. case histogram_algorithm::uniform:
  319. {
  320. // indices that sort "data"
  321. auto sorter = xt::argsort(data);
  322. // histogram: all of equal 'height'
  323. // - height
  324. weights_type w = xt::sum<weights_type>(weights)[0] / static_cast<weights_type>(bins);
  325. // - apply to all bins
  326. xt::xtensor<weights_type, 1> count = w * xt::ones<weights_type>({bins});
  327. // take cumulative sum, to act as easy look-up
  328. count = xt::cumsum(count);
  329. // edges
  330. // - allocate
  331. std::vector<size_t> shape = {bins + 1};
  332. xt::xtensor<value_type, 1> bin_edges = xtensor<value_type, 1>::from_shape(shape);
  333. // - first/last edge
  334. bin_edges[0] = left;
  335. bin_edges[bins] = right;
  336. // - cumulative weight
  337. weights_type cum_weight = static_cast<weights_type>(0);
  338. // - current bin
  339. size_type ibin = 0;
  340. // - loop to find interior bin-edges
  341. for (size_type i = 0; i < weights.size(); ++i)
  342. {
  343. if (cum_weight >= count[ibin])
  344. {
  345. bin_edges[ibin + 1] = data[sorter[i]];
  346. ++ibin;
  347. }
  348. cum_weight += weights[sorter[i]];
  349. }
  350. return bin_edges;
  351. }
  352. // bins of equal width
  353. default:
  354. {
  355. xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
  356. return bin_edges;
  357. }
  358. }
  359. }
  360. /**
  361. * @ingroup histogram
  362. * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
  363. *
  364. * @param data The data.
  365. * @param weights Weight factors corresponding to each data-point.
  366. * @param bins The number of bins. [default: 10]
  367. * @param mode The type of algorithm to use. [default: "auto"]
  368. * @return An one-dimensional xarray<double>, length: bins+1.
  369. */
  370. template <class E1, class E2>
  371. inline auto histogram_bin_edges(
  372. E1&& data,
  373. E2&& weights,
  374. std::size_t bins = 10,
  375. histogram_algorithm mode = histogram_algorithm::automatic
  376. )
  377. {
  378. using value_type = typename std::decay_t<E1>::value_type;
  379. std::array<value_type, 2> left_right;
  380. left_right = xt::minmax(data)();
  381. return histogram_bin_edges(
  382. std::forward<E1>(data),
  383. std::forward<E2>(weights),
  384. left_right[0],
  385. left_right[1],
  386. bins,
  387. mode
  388. );
  389. }
  390. /**
  391. * @ingroup histogram
  392. * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
  393. *
  394. * @param data The data.
  395. * @param bins The number of bins. [default: 10]
  396. * @param mode The type of algorithm to use. [default: "auto"]
  397. * @return An one-dimensional xarray<double>, length: bins+1.
  398. */
  399. template <class E1>
  400. inline auto
  401. histogram_bin_edges(E1&& data, std::size_t bins = 10, histogram_algorithm mode = histogram_algorithm::automatic)
  402. {
  403. using value_type = typename std::decay_t<E1>::value_type;
  404. auto n = data.size();
  405. std::array<value_type, 2> left_right;
  406. left_right = xt::minmax(data)();
  407. return histogram_bin_edges(
  408. std::forward<E1>(data),
  409. xt::ones<value_type>({n}),
  410. left_right[0],
  411. left_right[1],
  412. bins,
  413. mode
  414. );
  415. }
  416. /**
  417. * @ingroup histogram
  418. * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
  419. *
  420. * @param data The data.
  421. * @param left The lower-most edge.
  422. * @param right The upper-most edge.
  423. * @param bins The number of bins. [default: 10]
  424. * @param mode The type of algorithm to use. [default: "auto"]
  425. * @return An one-dimensional xarray<double>, length: bins+1.
  426. */
  427. template <class E1, class E2>
  428. inline auto histogram_bin_edges(
  429. E1&& data,
  430. E2 left,
  431. E2 right,
  432. std::size_t bins = 10,
  433. histogram_algorithm mode = histogram_algorithm::automatic
  434. )
  435. {
  436. using value_type = typename std::decay_t<E1>::value_type;
  437. auto n = data.size();
  438. return histogram_bin_edges(std::forward<E1>(data), xt::ones<value_type>({n}), left, right, bins, mode);
  439. }
  440. /**
  441. * Count number of occurrences of each value in array of non-negative ints.
  442. *
  443. * The number of bins (of size 1) is one larger than the largest value in x.
  444. * If minlength is specified, there will be at least this number of bins in
  445. * the output array (though it will be longer if necessary, depending on the
  446. * contents of x). Each bin gives the number of occurrences of its index
  447. * value in x. If weights is specified the input array is weighted by it,
  448. * i.e. if a value ``n`` is found at position ``i``, ``out[n] += weight[i]``
  449. * instead of ``out[n] += 1``.
  450. *
  451. * @param data the 1D container with integers to count into bins
  452. * @param weights a 1D container with the same number of elements as ``data``
  453. * @param minlength The minlength
  454. *
  455. * @return 1D container with the bincount
  456. */
  457. template <class E1, class E2, XTL_REQUIRES(is_xexpression<std::decay_t<E2>>)>
  458. inline auto bincount(E1&& data, E2&& weights, std::size_t minlength = 0)
  459. {
  460. using result_value_type = typename std::decay_t<E2>::value_type;
  461. using input_value_type = typename std::decay_t<E1>::value_type;
  462. using size_type = typename std::decay_t<E1>::size_type;
  463. static_assert(
  464. xtl::is_integral<typename std::decay_t<E1>::value_type>::value,
  465. "Bincount data has to be integral type."
  466. );
  467. XTENSOR_ASSERT(data.dimension() == 1);
  468. XTENSOR_ASSERT(weights.dimension() == 1);
  469. std::array<input_value_type, 2> left_right;
  470. left_right = xt::minmax(data)();
  471. if (left_right[0] < input_value_type(0))
  472. {
  473. XTENSOR_THROW(std::runtime_error, "Data argument for bincount can only contain positive integers!");
  474. }
  475. xt::xtensor<result_value_type, 1> res = xt::zeros<result_value_type>(
  476. {(std::max)(minlength, std::size_t(left_right[1] + 1))}
  477. );
  478. for (size_type i = 0; i < data.size(); ++i)
  479. {
  480. res(data(i)) += weights(i);
  481. }
  482. return res;
  483. }
  484. template <class E1>
  485. inline auto bincount(E1&& data, std::size_t minlength = 0)
  486. {
  487. return bincount(
  488. std::forward<E1>(data),
  489. xt::ones<typename std::decay_t<E1>::value_type>(data.shape()),
  490. minlength
  491. );
  492. }
  493. /**
  494. * Get the number of items in each bin, given the fraction of items per bin.
  495. * The output is such that the total number of items of all bins is exactly "N".
  496. *
  497. * @param N the number of items to distribute
  498. * @param weights fraction of items per bin: a 1D container whose size is the number of bins
  499. *
  500. * @return 1D container with the number of items per bin
  501. */
  502. template <class E>
  503. inline xt::xtensor<size_t, 1> bin_items(size_t N, E&& weights)
  504. {
  505. if (weights.size() <= std::size_t(1))
  506. {
  507. xt::xtensor<size_t, 1> n = N * xt::ones<size_t>({1});
  508. return n;
  509. }
  510. #ifdef XTENSOR_ENABLE_ASSERT
  511. using value_type = typename std::decay_t<E>::value_type;
  512. XTENSOR_ASSERT(xt::all(weights >= static_cast<value_type>(0)));
  513. XTENSOR_ASSERT(xt::sum(weights)() > static_cast<value_type>(0));
  514. #endif
  515. xt::xtensor<double, 1> P = xt::cast<double>(weights) / static_cast<double>(xt::sum(weights)());
  516. xt::xtensor<size_t, 1> n = xt::ceil(static_cast<double>(N) * P);
  517. if (xt::sum(n)() == N)
  518. {
  519. return n;
  520. }
  521. xt::xtensor<size_t, 1> d = xt::zeros<size_t>(P.shape());
  522. xt::xtensor<size_t, 1> sorter = xt::argsort(P);
  523. sorter = xt::view(sorter, xt::range(P.size(), _, -1));
  524. sorter = xt::view(sorter, xt::range(0, xt::sum(n)(0) - N));
  525. xt::view(d, xt::keep(sorter)) = 1;
  526. n -= d;
  527. return n;
  528. }
  529. /**
  530. * Get the number of items in each bin, with each bin having approximately the same number of
  531. * items in it,under the constraint that the total number of items of all bins is exactly "N".
  532. *
  533. * @param N the number of items to distribute
  534. * @param bins the number of bins
  535. *
  536. * @return 1D container with the number of items per bin
  537. */
  538. inline xt::xtensor<size_t, 1> bin_items(size_t N, size_t bins)
  539. {
  540. return bin_items(N, xt::ones<double>({bins}));
  541. }
  542. }
  543. #endif