| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614 |
- /***************************************************************************
- * Copyright (c) Johan Mabille, Sylvain Corlay and Wolf Vollprecht *
- * Copyright (c) QuantStack *
- * *
- * Distributed under the terms of the BSD 3-Clause License. *
- * *
- * The full license is in the file LICENSE, distributed with this software. *
- ****************************************************************************/
- /**
- * @brief construct histogram
- */
- #ifndef XTENSOR_HISTOGRAM_HPP
- #define XTENSOR_HISTOGRAM_HPP
- #include "xset_operation.hpp"
- #include "xsort.hpp"
- #include "xtensor.hpp"
- #include "xview.hpp"
- using namespace xt::placeholders;
- namespace xt
- {
- /**
- * @ingroup digitize
- * @brief Return the indices of the bins to which each value in input array belongs.
- *
- * @param data The data.
- * @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic.
- * @param right Indicating whether the intervals include the right or the left bin edge.
- * @return Output array of indices, of same shape as x.
- */
- template <class E1, class E2>
- inline auto digitize(E1&& data, E2&& bin_edges, bool right = false)
- {
- XTENSOR_ASSERT(bin_edges.dimension() == 1);
- XTENSOR_ASSERT(bin_edges.size() >= 2);
- XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend()));
- XTENSOR_ASSERT(xt::amin(data)[0] >= bin_edges[0]);
- XTENSOR_ASSERT(xt::amax(data)[0] <= bin_edges[bin_edges.size() - 1]);
- return xt::searchsorted(std::forward<E2>(bin_edges), std::forward<E1>(data), right);
- }
- namespace detail
- {
- template <class R = double, class E1, class E2, class E3>
- inline auto histogram_imp(E1&& data, E2&& bin_edges, E3&& weights, bool density, bool equal_bins)
- {
- using size_type = common_size_type_t<std::decay_t<E1>, std::decay_t<E2>, std::decay_t<E3>>;
- using value_type = typename std::decay_t<E3>::value_type;
- XTENSOR_ASSERT(data.dimension() == 1);
- XTENSOR_ASSERT(weights.dimension() == 1);
- XTENSOR_ASSERT(bin_edges.dimension() == 1);
- XTENSOR_ASSERT(weights.size() == data.size());
- XTENSOR_ASSERT(bin_edges.size() >= 2);
- XTENSOR_ASSERT(std::is_sorted(bin_edges.cbegin(), bin_edges.cend()));
- size_t n_bins = bin_edges.size() - 1;
- xt::xtensor<value_type, 1> count = xt::zeros<value_type>({n_bins});
- if (equal_bins)
- {
- std::array<typename std::decay_t<E2>::value_type, 2> bounds = xt::minmax(bin_edges)();
- auto left = static_cast<double>(bounds[0]);
- auto right = static_cast<double>(bounds[1]);
- double norm = 1. / (right - left);
- for (size_t i = 0; i < data.size(); ++i)
- {
- auto v = static_cast<double>(data(i));
- // left and right are not bounds of data
- if (v >= left && v < right)
- {
- auto i_bin = static_cast<size_t>(static_cast<double>(n_bins) * (v - left) * norm);
- count(i_bin) += weights(i);
- }
- else if (v == right)
- {
- count(n_bins - 1) += weights(i);
- }
- }
- }
- else
- {
- auto sorter = xt::argsort(data);
- size_type ibin = 0;
- for (auto& idx : sorter)
- {
- auto item = data[idx];
- if (item < bin_edges[0])
- {
- continue;
- }
- if (item > bin_edges[n_bins])
- {
- break;
- }
- while (item >= bin_edges[ibin + 1] && ibin < n_bins - 1)
- {
- ++ibin;
- }
- count[ibin] += weights[idx];
- }
- }
- xt::xtensor<R, 1> prob = xt::cast<R>(count);
- if (density)
- {
- R n = static_cast<R>(data.size());
- for (size_type i = 0; i < bin_edges.size() - 1; ++i)
- {
- prob[i] /= (static_cast<R>(bin_edges[i + 1] - bin_edges[i]) * n);
- }
- }
- return prob;
- }
- } // detail
- /**
- * @ingroup histogram
- * @brief Compute the histogram of a set of data.
- *
- * @param data The data.
- * @param bin_edges The bin-edges. It has to be 1-dimensional and monotonic.
- * @param weights Weight factors corresponding to each data-point.
- * @param density If true the resulting integral is normalized to 1. [default: false]
- * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
- */
- template <class R = double, class E1, class E2, class E3>
- inline auto histogram(E1&& data, E2&& bin_edges, E3&& weights, bool density = false)
- {
- return detail::histogram_imp<R>(
- std::forward<E1>(data),
- std::forward<E2>(bin_edges),
- std::forward<E3>(weights),
- density,
- false
- );
- }
- /**
- * @ingroup histogram
- * @brief Compute the histogram of a set of data.
- *
- * @param data The data.
- * @param bin_edges The bin-edges.
- * @param density If true the resulting integral is normalized to 1. [default: false]
- * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
- */
- template <class R = double, class E1, class E2>
- inline auto histogram(E1&& data, E2&& bin_edges, bool density = false)
- {
- using value_type = typename std::decay_t<E1>::value_type;
- auto n = data.size();
- return detail::histogram_imp<R>(
- std::forward<E1>(data),
- std::forward<E2>(bin_edges),
- xt::ones<value_type>({n}),
- density,
- false
- );
- }
- /**
- * @ingroup histogram
- * @brief Compute the histogram of a set of data.
- *
- * @param data The data.
- * @param bins The number of bins. [default: 10]
- * @param density If true the resulting integral is normalized to 1. [default: false]
- * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
- */
- template <class R = double, class E1>
- inline auto histogram(E1&& data, std::size_t bins = 10, bool density = false)
- {
- using value_type = typename std::decay_t<E1>::value_type;
- auto n = data.size();
- return detail::histogram_imp<R>(
- std::forward<E1>(data),
- histogram_bin_edges(data, xt::ones<value_type>({n}), bins),
- xt::ones<value_type>({n}),
- density,
- true
- );
- }
- /**
- * @ingroup histogram
- * @brief Compute the histogram of a set of data.
- *
- * @param data The data.
- * @param bins The number of bins.
- * @param left The lower-most edge.
- * @param right The upper-most edge.
- * @param density If true the resulting integral is normalized to 1. [default: false]
- * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
- */
- template <class R = double, class E1, class E2>
- inline auto histogram(E1&& data, std::size_t bins, E2 left, E2 right, bool density = false)
- {
- using value_type = typename std::decay_t<E1>::value_type;
- auto n = data.size();
- return detail::histogram_imp<R>(
- std::forward<E1>(data),
- histogram_bin_edges(data, left, right, bins),
- xt::ones<value_type>({n}),
- density,
- true
- );
- }
- /**
- * @ingroup histogram
- * @brief Compute the histogram of a set of data.
- *
- * @param data The data.
- * @param bins The number of bins.
- * @param weights Weight factors corresponding to each data-point.
- * @param density If true the resulting integral is normalized to 1. [default: false]
- * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
- */
- template <class R = double, class E1, class E2>
- inline auto histogram(E1&& data, std::size_t bins, E2&& weights, bool density = false)
- {
- return detail::histogram_imp<R>(
- std::forward<E1>(data),
- histogram_bin_edges(data, weights, bins),
- std::forward<E2>(weights),
- density,
- true
- );
- }
- /**
- * @ingroup histogram
- * @brief Compute the histogram of a set of data.
- *
- * @param data The data.
- * @param bins The number of bins.
- * @param left The lower-most edge.
- * @param right The upper-most edge.
- * @param weights Weight factors corresponding to each data-point.
- * @param density If true the resulting integral is normalized to 1. [default: false]
- * @return An one-dimensional xarray<double>, length: bin_edges.size()-1.
- */
- template <class R = double, class E1, class E2, class E3>
- inline auto histogram(E1&& data, std::size_t bins, E2&& weights, E3 left, E3 right, bool density = false)
- {
- return detail::histogram_imp<R>(
- std::forward<E1>(data),
- histogram_bin_edges(data, weights, left, right, bins),
- std::forward<E2>(weights),
- density,
- true
- );
- }
- /**
- * @ingroup histogram
- * @brief Defines different algorithms to be used in "histogram_bin_edges"
- */
- enum class histogram_algorithm
- {
- automatic,
- linspace,
- logspace,
- uniform
- };
- /**
- * @ingroup histogram
- * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
- *
- * @param data The data.
- * @param weights Weight factors corresponding to each data-point.
- * @param left The lower-most edge.
- * @param right The upper-most edge.
- * @param bins The number of bins. [default: 10]
- * @param mode The type of algorithm to use. [default: "auto"]
- * @return An one-dimensional xarray<double>, length: bins+1.
- */
- template <class E1, class E2, class E3>
- inline auto histogram_bin_edges(
- E1&& data,
- E2&& weights,
- E3 left,
- E3 right,
- std::size_t bins = 10,
- histogram_algorithm mode = histogram_algorithm::automatic
- )
- {
- // counter and return type
- using size_type = common_size_type_t<std::decay_t<E1>, std::decay_t<E2>>;
- using value_type = typename std::decay_t<E1>::value_type;
- using weights_type = typename std::decay_t<E2>::value_type;
- // basic checks
- // - rank
- XTENSOR_ASSERT(data.dimension() == 1);
- XTENSOR_ASSERT(weights.dimension() == 1);
- // - size
- XTENSOR_ASSERT(weights.size() == data.size());
- // - bounds
- XTENSOR_ASSERT(left <= right);
- // - non-empty
- XTENSOR_ASSERT(bins > std::size_t(0));
- // act on different modes
- switch (mode)
- {
- // bins of equal width
- case histogram_algorithm::automatic:
- {
- xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
- return bin_edges;
- }
- // bins of equal width
- case histogram_algorithm::linspace:
- {
- xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
- return bin_edges;
- }
- // bins of logarithmically increasing size
- case histogram_algorithm::logspace:
- {
- using rhs_value_type = std::conditional_t<xtl::is_integral<value_type>::value, double, value_type>;
- xtensor<value_type, 1> bin_edges = xt::cast<value_type>(
- xt::logspace<rhs_value_type>(std::log10(left), std::log10(right), bins + 1)
- );
- // TODO: replace above with below after 'xsimd' fix
- // xt::xtensor<value_type,1> bin_edges = xt::logspace<value_type>(
- // std::log10(left), std::log10(right), bins+1);
- return bin_edges;
- }
- // same amount of data in each bin
- case histogram_algorithm::uniform:
- {
- // indices that sort "data"
- auto sorter = xt::argsort(data);
- // histogram: all of equal 'height'
- // - height
- weights_type w = xt::sum<weights_type>(weights)[0] / static_cast<weights_type>(bins);
- // - apply to all bins
- xt::xtensor<weights_type, 1> count = w * xt::ones<weights_type>({bins});
- // take cumulative sum, to act as easy look-up
- count = xt::cumsum(count);
- // edges
- // - allocate
- std::vector<size_t> shape = {bins + 1};
- xt::xtensor<value_type, 1> bin_edges = xtensor<value_type, 1>::from_shape(shape);
- // - first/last edge
- bin_edges[0] = left;
- bin_edges[bins] = right;
- // - cumulative weight
- weights_type cum_weight = static_cast<weights_type>(0);
- // - current bin
- size_type ibin = 0;
- // - loop to find interior bin-edges
- for (size_type i = 0; i < weights.size(); ++i)
- {
- if (cum_weight >= count[ibin])
- {
- bin_edges[ibin + 1] = data[sorter[i]];
- ++ibin;
- }
- cum_weight += weights[sorter[i]];
- }
- return bin_edges;
- }
- // bins of equal width
- default:
- {
- xt::xtensor<value_type, 1> bin_edges = xt::linspace<value_type>(left, right, bins + 1);
- return bin_edges;
- }
- }
- }
- /**
- * @ingroup histogram
- * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
- *
- * @param data The data.
- * @param weights Weight factors corresponding to each data-point.
- * @param bins The number of bins. [default: 10]
- * @param mode The type of algorithm to use. [default: "auto"]
- * @return An one-dimensional xarray<double>, length: bins+1.
- */
- template <class E1, class E2>
- inline auto histogram_bin_edges(
- E1&& data,
- E2&& weights,
- std::size_t bins = 10,
- histogram_algorithm mode = histogram_algorithm::automatic
- )
- {
- using value_type = typename std::decay_t<E1>::value_type;
- std::array<value_type, 2> left_right;
- left_right = xt::minmax(data)();
- return histogram_bin_edges(
- std::forward<E1>(data),
- std::forward<E2>(weights),
- left_right[0],
- left_right[1],
- bins,
- mode
- );
- }
- /**
- * @ingroup histogram
- * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
- *
- * @param data The data.
- * @param bins The number of bins. [default: 10]
- * @param mode The type of algorithm to use. [default: "auto"]
- * @return An one-dimensional xarray<double>, length: bins+1.
- */
- template <class E1>
- inline auto
- histogram_bin_edges(E1&& data, std::size_t bins = 10, histogram_algorithm mode = histogram_algorithm::automatic)
- {
- using value_type = typename std::decay_t<E1>::value_type;
- auto n = data.size();
- std::array<value_type, 2> left_right;
- left_right = xt::minmax(data)();
- return histogram_bin_edges(
- std::forward<E1>(data),
- xt::ones<value_type>({n}),
- left_right[0],
- left_right[1],
- bins,
- mode
- );
- }
- /**
- * @ingroup histogram
- * @brief Compute the bin-edges of a histogram of a set of data using different algorithms.
- *
- * @param data The data.
- * @param left The lower-most edge.
- * @param right The upper-most edge.
- * @param bins The number of bins. [default: 10]
- * @param mode The type of algorithm to use. [default: "auto"]
- * @return An one-dimensional xarray<double>, length: bins+1.
- */
- template <class E1, class E2>
- inline auto histogram_bin_edges(
- E1&& data,
- E2 left,
- E2 right,
- std::size_t bins = 10,
- histogram_algorithm mode = histogram_algorithm::automatic
- )
- {
- using value_type = typename std::decay_t<E1>::value_type;
- auto n = data.size();
- return histogram_bin_edges(std::forward<E1>(data), xt::ones<value_type>({n}), left, right, bins, mode);
- }
- /**
- * Count number of occurrences of each value in array of non-negative ints.
- *
- * The number of bins (of size 1) is one larger than the largest value in x.
- * If minlength is specified, there will be at least this number of bins in
- * the output array (though it will be longer if necessary, depending on the
- * contents of x). Each bin gives the number of occurrences of its index
- * value in x. If weights is specified the input array is weighted by it,
- * i.e. if a value ``n`` is found at position ``i``, ``out[n] += weight[i]``
- * instead of ``out[n] += 1``.
- *
- * @param data the 1D container with integers to count into bins
- * @param weights a 1D container with the same number of elements as ``data``
- * @param minlength The minlength
- *
- * @return 1D container with the bincount
- */
- template <class E1, class E2, XTL_REQUIRES(is_xexpression<std::decay_t<E2>>)>
- inline auto bincount(E1&& data, E2&& weights, std::size_t minlength = 0)
- {
- using result_value_type = typename std::decay_t<E2>::value_type;
- using input_value_type = typename std::decay_t<E1>::value_type;
- using size_type = typename std::decay_t<E1>::size_type;
- static_assert(
- xtl::is_integral<typename std::decay_t<E1>::value_type>::value,
- "Bincount data has to be integral type."
- );
- XTENSOR_ASSERT(data.dimension() == 1);
- XTENSOR_ASSERT(weights.dimension() == 1);
- std::array<input_value_type, 2> left_right;
- left_right = xt::minmax(data)();
- if (left_right[0] < input_value_type(0))
- {
- XTENSOR_THROW(std::runtime_error, "Data argument for bincount can only contain positive integers!");
- }
- xt::xtensor<result_value_type, 1> res = xt::zeros<result_value_type>(
- {(std::max)(minlength, std::size_t(left_right[1] + 1))}
- );
- for (size_type i = 0; i < data.size(); ++i)
- {
- res(data(i)) += weights(i);
- }
- return res;
- }
- template <class E1>
- inline auto bincount(E1&& data, std::size_t minlength = 0)
- {
- return bincount(
- std::forward<E1>(data),
- xt::ones<typename std::decay_t<E1>::value_type>(data.shape()),
- minlength
- );
- }
- /**
- * Get the number of items in each bin, given the fraction of items per bin.
- * The output is such that the total number of items of all bins is exactly "N".
- *
- * @param N the number of items to distribute
- * @param weights fraction of items per bin: a 1D container whose size is the number of bins
- *
- * @return 1D container with the number of items per bin
- */
- template <class E>
- inline xt::xtensor<size_t, 1> bin_items(size_t N, E&& weights)
- {
- if (weights.size() <= std::size_t(1))
- {
- xt::xtensor<size_t, 1> n = N * xt::ones<size_t>({1});
- return n;
- }
- #ifdef XTENSOR_ENABLE_ASSERT
- using value_type = typename std::decay_t<E>::value_type;
- XTENSOR_ASSERT(xt::all(weights >= static_cast<value_type>(0)));
- XTENSOR_ASSERT(xt::sum(weights)() > static_cast<value_type>(0));
- #endif
- xt::xtensor<double, 1> P = xt::cast<double>(weights) / static_cast<double>(xt::sum(weights)());
- xt::xtensor<size_t, 1> n = xt::ceil(static_cast<double>(N) * P);
- if (xt::sum(n)() == N)
- {
- return n;
- }
- xt::xtensor<size_t, 1> d = xt::zeros<size_t>(P.shape());
- xt::xtensor<size_t, 1> sorter = xt::argsort(P);
- sorter = xt::view(sorter, xt::range(P.size(), _, -1));
- sorter = xt::view(sorter, xt::range(0, xt::sum(n)(0) - N));
- xt::view(d, xt::keep(sorter)) = 1;
- n -= d;
- return n;
- }
- /**
- * Get the number of items in each bin, with each bin having approximately the same number of
- * items in it,under the constraint that the total number of items of all bins is exactly "N".
- *
- * @param N the number of items to distribute
- * @param bins the number of bins
- *
- * @return 1D container with the number of items per bin
- */
- inline xt::xtensor<size_t, 1> bin_items(size_t N, size_t bins)
- {
- return bin_items(N, xt::ones<double>({bins}));
- }
- }
- #endif
|