258 lines
7.5 KiB
C
258 lines
7.5 KiB
C
#pragma once
|
|
#include <stdlib.h>
|
|
|
|
/**
|
|
* Adaptive histogram based on something like streaming k-means crossed with Q-digest.
|
|
* The implementation is a direct descendent of MergingDigest
|
|
* https://github.com/tdunning/t-digest/
|
|
*
|
|
* Copyright (c) 2021 Redis, All rights reserved.
|
|
* Copyright (c) 2018 Andrew Werner, All rights reserved.
|
|
*
|
|
* The special characteristics of this algorithm are:
|
|
*
|
|
* - smaller summaries than Q-digest
|
|
*
|
|
* - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy
|
|
* for middle quantiles
|
|
*
|
|
* - fast
|
|
*
|
|
* - simple
|
|
*
|
|
* - easy to adapt for use with map-reduce
|
|
*/
|
|
|
|
#define MM_PI 3.14159265358979323846
|
|
|
|
struct td_histogram {
|
|
// compression is a setting used to configure the size of centroids when merged.
|
|
double compression;
|
|
|
|
double min;
|
|
double max;
|
|
|
|
// cap is the total size of nodes
|
|
int cap;
|
|
// merged_nodes is the number of merged nodes at the front of nodes.
|
|
int merged_nodes;
|
|
// unmerged_nodes is the number of buffered nodes.
|
|
int unmerged_nodes;
|
|
|
|
// we run the merge in reverse every other merge to avoid left-to-right bias in merging
|
|
long long total_compressions;
|
|
|
|
long long merged_weight;
|
|
long long unmerged_weight;
|
|
|
|
double *nodes_mean;
|
|
long long *nodes_weight;
|
|
};
|
|
|
|
typedef struct td_histogram td_histogram_t;
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
/**
|
|
* Allocate the memory, initialise the t-digest, and return the histogram as output parameter.
|
|
* @param compression The compression parameter.
|
|
* 100 is a common value for normal uses.
|
|
* 1000 is extremely large.
|
|
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
|
|
* number.
|
|
* @return the histogram on success, NULL if allocation failed.
|
|
*/
|
|
td_histogram_t *td_new(double compression);
|
|
|
|
/**
|
|
* Allocate the memory and initialise the t-digest.
|
|
*
|
|
* @param compression The compression parameter.
|
|
* 100 is a common value for normal uses.
|
|
* 1000 is extremely large.
|
|
* The number of centroids retained will be a smallish (usually less than 10) multiple of this
|
|
* number.
|
|
* @param result Output parameter to capture allocated histogram.
|
|
* @return 0 on success, 1 if allocation failed.
|
|
*/
|
|
int td_init(double compression, td_histogram_t **result);
|
|
|
|
/**
|
|
* Frees the memory associated with the t-digest.
|
|
*
|
|
* @param h The histogram you want to free.
|
|
*/
|
|
void td_free(td_histogram_t *h);
|
|
|
|
/**
|
|
* Reset a histogram to zero - empty out a histogram and re-initialise it
|
|
*
|
|
* If you want to re-use an existing histogram, but reset everything back to zero, this
|
|
* is the routine to use.
|
|
*
|
|
* @param h The histogram you want to reset to empty.
|
|
*
|
|
*/
|
|
void td_reset(td_histogram_t *h);
|
|
|
|
/**
|
|
* Adds a sample to a histogram.
|
|
*
|
|
* @param val The value to add.
|
|
* @param weight The weight of this point.
|
|
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
|
|
* weight.
|
|
*
|
|
*/
|
|
int td_add(td_histogram_t *h, double val, long long weight);
|
|
|
|
/**
|
|
* Re-examines a t-digest to determine whether some centroids are redundant. If your data are
|
|
* perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
|
|
*
|
|
* The cost is roughly the same as adding as many data points as there are centroids. This
|
|
* is typically < 10 * compression, but could be as high as 100 * compression.
|
|
* This is a destructive operation that is not thread-safe.
|
|
*
|
|
* @param h The histogram you want to compress.
|
|
* @return 0 on success, EDOM if overflow was detected as a consequence of adding the provided
|
|
* weight. If overflow is detected the histogram is not changed.
|
|
*
|
|
*/
|
|
int td_compress(td_histogram_t *h);
|
|
|
|
/**
|
|
* Merges all of the values from 'from' to 'this' histogram.
|
|
*
|
|
* @param h "This" pointer
|
|
* @param from Histogram to copy values from.
|
|
* * @return 0 on success, EDOM if overflow was detected as a consequence of merging the the
|
|
* provided histogram. If overflow is detected the original histogram is not detected.
|
|
*/
|
|
int td_merge(td_histogram_t *h, td_histogram_t *from);
|
|
|
|
/**
|
|
* Returns the fraction of all points added which are ≤ x.
|
|
*
|
|
* @param x The cutoff for the cdf.
|
|
* @return The fraction of all data which is less or equal to x.
|
|
*/
|
|
double td_cdf(td_histogram_t *h, double x);
|
|
|
|
/**
|
|
* Returns an estimate of the cutoff such that a specified fraction of the data
|
|
* added to this TDigest would be less than or equal to the cutoff.
|
|
*
|
|
* @param q The desired fraction
|
|
* @return The value x such that cdf(x) == q;
|
|
*/
|
|
double td_quantile(td_histogram_t *h, double q);
|
|
|
|
/**
|
|
* Returns an estimate of the cutoff such that a specified fraction of the data
|
|
* added to this TDigest would be less than or equal to the cutoffs.
|
|
*
|
|
* @param quantiles The ordered percentiles array to get the values for.
|
|
* @param values Destination array containing the values at the given quantiles.
|
|
* The values array should be allocated by the caller.
|
|
* @return 0 on success, ENOMEM if the provided destination array is null.
|
|
*/
|
|
int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, size_t length);
|
|
|
|
/**
|
|
* Returns the trimmed mean ignoring values outside given cutoff upper and lower limits.
|
|
*
|
|
* @param leftmost_cut Fraction to cut off of the left tail of the distribution.
|
|
* @param rightmost_cut Fraction to cut off of the right tail of the distribution.
|
|
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
|
|
*/
|
|
double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_cut);
|
|
|
|
/**
|
|
* Returns the trimmed mean ignoring values outside given a symmetric cutoff limits.
|
|
*
|
|
* @param proportion_to_cut Fraction to cut off of the left and right tails of the distribution.
|
|
* @return The trimmed mean ignoring values outside given cutoff upper and lower limits;
|
|
*/
|
|
double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut);
|
|
|
|
/**
|
|
* Returns the current compression factor.
|
|
*
|
|
* @return The compression factor originally used to set up the TDigest.
|
|
*/
|
|
int td_compression(td_histogram_t *h);
|
|
|
|
/**
|
|
* Returns the number of points that have been added to this TDigest.
|
|
*
|
|
* @return The sum of the weights on all centroids.
|
|
*/
|
|
long long td_size(td_histogram_t *h);
|
|
|
|
/**
|
|
* Returns the number of centroids being used by this TDigest.
|
|
*
|
|
* @return The number of centroids being used.
|
|
*/
|
|
int td_centroid_count(td_histogram_t *h);
|
|
|
|
/**
|
|
* Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram
|
|
* is empty.
|
|
*
|
|
* @param h "This" pointer
|
|
*/
|
|
double td_min(td_histogram_t *h);
|
|
|
|
/**
|
|
* Get maximum value from the histogram. Will return - __DBL_MAX__ if the histogram
|
|
* is empty.
|
|
*
|
|
* @param h "This" pointer
|
|
*/
|
|
double td_max(td_histogram_t *h);
|
|
|
|
/**
|
|
* Get the full centroids weight array for 'this' histogram.
|
|
*
|
|
* @param h "This" pointer
|
|
*
|
|
* @return The full centroids weight array.
|
|
*/
|
|
const long long *td_centroids_weight(td_histogram_t *h);
|
|
|
|
/**
|
|
* Get the full centroids mean array for 'this' histogram.
|
|
*
|
|
* @param h "This" pointer
|
|
*
|
|
* @return The full centroids mean array.
|
|
*/
|
|
const double *td_centroids_mean(td_histogram_t *h);
|
|
|
|
/**
|
|
* Get the centroid weight for 'this' histogram and 'pos'.
|
|
*
|
|
* @param h "This" pointer
|
|
* @param pos centroid position.
|
|
*
|
|
* @return The centroid weight.
|
|
*/
|
|
long long td_centroids_weight_at(td_histogram_t *h, int pos);
|
|
|
|
/**
|
|
* Get the centroid mean for 'this' histogram and 'pos'.
|
|
*
|
|
* @param h "This" pointer
|
|
* @param pos centroid position.
|
|
*
|
|
* @return The centroid mean.
|
|
*/
|
|
double td_centroids_mean_at(td_histogram_t *h, int pos);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|