#pragma once #include /** * Adaptive histogram based on something like streaming k-means crossed with Q-digest. * The implementation is a direct descendent of MergingDigest * https://github.com/tdunning/t-digest/ * * Copyright (c) 2021 Redis, All rights reserved. * Copyright (c) 2018 Andrew Werner, All rights reserved. * * The special characteristics of this algorithm are: * * - smaller summaries than Q-digest * * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy * for middle quantiles * * - fast * * - simple * * - easy to adapt for use with map-reduce */ #define MM_PI 3.14159265358979323846 struct td_histogram { // compression is a setting used to configure the size of centroids when merged. double compression; double min; double max; // cap is the total size of nodes int cap; // merged_nodes is the number of merged nodes at the front of nodes. int merged_nodes; // unmerged_nodes is the number of buffered nodes. int unmerged_nodes; // we run the merge in reverse every other merge to avoid left-to-right bias in merging long long total_compressions; long long merged_weight; long long unmerged_weight; double *nodes_mean; long long *nodes_weight; }; typedef struct td_histogram td_histogram_t; #ifdef __cplusplus extern "C" { #endif /** * Allocate the memory, initialise the t-digest, and return the histogram as output parameter. * @param compression The compression parameter. * 100 is a common value for normal uses. * 1000 is extremely large. * The number of centroids retained will be a smallish (usually less than 10) multiple of this * number. * @return the histogram on success, NULL if allocation failed. */ td_histogram_t *td_new(double compression); /** * Allocate the memory and initialise the t-digest. * * @param compression The compression parameter. * 100 is a common value for normal uses. * 1000 is extremely large. * The number of centroids retained will be a smallish (usually less than 10) multiple of this * number. * @param result Output parameter to capture allocated histogram. * @return 0 on success, 1 if allocation failed. */ int td_init(double compression, td_histogram_t **result); /** * Frees the memory associated with the t-digest. * * @param h The histogram you want to free. */ void td_free(td_histogram_t *h); /** * Reset a histogram to zero - empty out a histogram and re-initialise it * * If you want to re-use an existing histogram, but reset everything back to zero, this * is the routine to use. * * @param h The histogram you want to reset to empty. * */ void td_reset(td_histogram_t *h); /** * Adds a sample to a histogram. * * @param val The value to add. * @param weight The weight of this point. * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided * weight. * */ int td_add(td_histogram_t *h, double val, long long weight); /** * Re-examines a t-digest to determine whether some centroids are redundant. If your data are * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space. * * The cost is roughly the same as adding as many data points as there are centroids. This * is typically < 10 * compression, but could be as high as 100 * compression. * This is a destructive operation that is not thread-safe. * * @param h The histogram you want to compress. * @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided * weight. If overflow is detected the histogram is not changed. * */ int td_compress(td_histogram_t *h); /** * Merges all of the values from 'from' to 'this' histogram. * * @param h "This" pointer * @param from Histogram to copy values from. * * @return 0 on success, EDOM if overflow was detected as a consequence of merging the the * provided histogram. If overflow is detected the original histogram is not detected. */ int td_merge(td_histogram_t *h, td_histogram_t *from); /** * Returns the fraction of all points added which are ≤ x. * * @param x The cutoff for the cdf. * @return The fraction of all data which is less or equal to x. */ double td_cdf(td_histogram_t *h, double x); /** * Returns an estimate of the cutoff such that a specified fraction of the data * added to this TDigest would be less than or equal to the cutoff. * * @param q The desired fraction * @return The value x such that cdf(x) == q; */ double td_quantile(td_histogram_t *h, double q); /** * Returns an estimate of the cutoff such that a specified fraction of the data * added to this TDigest would be less than or equal to the cutoffs. * * @param quantiles The ordered percentiles array to get the values for. * @param values Destination array containing the values at the given quantiles. * The values array should be allocated by the caller. * @return 0 on success, ENOMEM if the provided destination array is null. */ int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length); /** * Returns the trimmed mean ignoring values outside given cutoff upper and lower limits. * * @param leftmost_cut Fraction to cut off of the left tail of the distribution. * @param rightmost_cut Fraction to cut off of the right tail of the distribution. * @return The trimmed mean ignoring values outside given cutoff upper and lower limits; */ double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut); /** * Returns the trimmed mean ignoring values outside given a symmetric cutoff limits. * * @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution. * @return The trimmed mean ignoring values outside given cutoff upper and lower limits; */ double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut); /** * Returns the current compression factor. * * @return The compression factor originally used to set up the TDigest. */ int td_compression(td_histogram_t *h); /** * Returns the number of points that have been added to this TDigest. * * @return The sum of the weights on all centroids. */ long long td_size(td_histogram_t *h); /** * Returns the number of centroids being used by this TDigest. * * @return The number of centroids being used. */ int td_centroid_count(td_histogram_t *h); /** * Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram * is empty. * * @param h "This" pointer */ double td_min(td_histogram_t *h); /** * Get maximum value from the histogram. Will return - __DBL_MAX__ if the histogram * is empty. * * @param h "This" pointer */ double td_max(td_histogram_t *h); /** * Get the full centroids weight array for 'this' histogram. * * @param h "This" pointer * * @return The full centroids weight array. */ const long long *td_centroids_weight(td_histogram_t *h); /** * Get the full centroids mean array for 'this' histogram. * * @param h "This" pointer * * @return The full centroids mean array. */ const double *td_centroids_mean(td_histogram_t *h); /** * Get the centroid weight for 'this' histogram and 'pos'. * * @param h "This" pointer * @param pos centroid position. * * @return The centroid weight. */ long long td_centroids_weight_at(td_histogram_t *h, int pos); /** * Get the centroid mean for 'this' histogram and 'pos'. * * @param h "This" pointer * @param pos centroid position. * * @return The centroid mean. */ double td_centroids_mean_at(td_histogram_t *h, int pos); #ifdef __cplusplus } #endif