/src/ckmeans.js | simplestatistics@v7.7.5

simple statistics for node & browser javascript
File
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
import makeMatrix from "./make_matrix.js";import numericSort from "./numeric_sort.js";import uniqueCountSorted from "./unique_count_sorted.js";
/** * Generates incrementally computed values based on the sums and sums of * squares for the data array * * @private * @param {number} j * @param {number} i * @param {Array<number>} sums * @param {Array<number>} sumsOfSquares * @return {number} * @example * ssq(0, 1, [-1, 0, 2], [1, 1, 5]); */function ssq(j, i, sums, sumsOfSquares) {    let sji; // s(j, i)    if (j > 0) {        const muji = (sums[i] - sums[j - 1]) / (i - j + 1); // mu(j, i)        sji =            sumsOfSquares[i] - sumsOfSquares[j - 1] - (i - j + 1) * muji * muji;    } else {        sji = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1);    }    if (sji < 0) {        return 0;    }    return sji;}
/** * Function that recursively divides and conquers computations * for cluster j * * @private * @param {number} iMin Minimum index in cluster to be computed * @param {number} iMax Maximum index in cluster to be computed * @param {number} cluster Index of the cluster currently being computed * @param {Array<Array<number>>} matrix * @param {Array<Array<number>>} backtrackMatrix * @param {Array<number>} sums * @param {Array<number>} sumsOfSquares */function fillMatrixColumn(    iMin,    iMax,    cluster,    matrix,    backtrackMatrix,    sums,    sumsOfSquares) {    if (iMin > iMax) {        return;    }
    // Start at midpoint between iMin and iMax    const i = Math.floor((iMin + iMax) / 2);
    matrix[cluster][i] = matrix[cluster - 1][i - 1];    backtrackMatrix[cluster][i] = i;
    let jlow = cluster; // the lower end for j
    if (iMin > cluster) {        jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0);    }    jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);
    let jhigh = i - 1; // the upper end for j    if (iMax < matrix[0].length - 1) {        jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0);    }
    let sji;    let sjlowi;    let ssqjlow;    let ssqj;    for (let j = jhigh; j >= jlow; --j) {        sji = ssq(j, i, sums, sumsOfSquares);
        if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) {            break;        }
        // Examine the lower bound of the cluster border        sjlowi = ssq(jlow, i, sums, sumsOfSquares);
        ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];
        if (ssqjlow < matrix[cluster][i]) {            // Shrink the lower bound            matrix[cluster][i] = ssqjlow;            backtrackMatrix[cluster][i] = jlow;        }        jlow++;
        ssqj = sji + matrix[cluster - 1][j - 1];        if (ssqj < matrix[cluster][i]) {            matrix[cluster][i] = ssqj;            backtrackMatrix[cluster][i] = j;        }    }
    fillMatrixColumn(        iMin,        i - 1,        cluster,        matrix,        backtrackMatrix,        sums,        sumsOfSquares    );    fillMatrixColumn(        i + 1,        iMax,        cluster,        matrix,        backtrackMatrix,        sums,        sumsOfSquares    );}
/** * Initializes the main matrices used in Ckmeans and kicks * off the divide and conquer cluster computation strategy * * @private * @param {Array<number>} data sorted array of values * @param {Array<Array<number>>} matrix * @param {Array<Array<number>>} backtrackMatrix */function fillMatrices(data, matrix, backtrackMatrix) {    const nValues = matrix[0].length;
    // Shift values by the median to improve numeric stability    const shift = data[Math.floor(nValues / 2)];
    // Cumulative sum and cumulative sum of squares for all values in data array    const sums = [];    const sumsOfSquares = [];
    // Initialize first column in matrix & backtrackMatrix    for (let i = 0, shiftedValue; i < nValues; ++i) {        shiftedValue = data[i] - shift;        if (i === 0) {            sums.push(shiftedValue);            sumsOfSquares.push(shiftedValue * shiftedValue);        } else {            sums.push(sums[i - 1] + shiftedValue);            sumsOfSquares.push(                sumsOfSquares[i - 1] + shiftedValue * shiftedValue            );        }
        // Initialize for cluster = 0        matrix[0][i] = ssq(0, i, sums, sumsOfSquares);        backtrackMatrix[0][i] = 0;    }
    // Initialize the rest of the columns    let iMin;    for (let cluster = 1; cluster < matrix.length; ++cluster) {        if (cluster < matrix.length - 1) {            iMin = cluster;        } else {            // No need to compute matrix[K-1][0] ... matrix[K-1][N-2]            iMin = nValues - 1;        }
        fillMatrixColumn(            iMin,            nValues - 1,            cluster,            matrix,            backtrackMatrix,            sums,            sumsOfSquares        );    }}
/** * Ckmeans clustering is an improvement on heuristic-based clustering * approaches like Jenks. The algorithm was developed in * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf) * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach * to the problem of clustering numeric data into groups with the least * within-group sum-of-squared-deviations. * * Minimizing the difference within groups - what Wang & Song refer to as * `withinss`, or within sum-of-squares, means that groups are optimally * homogenous within and the data is split into representative groups. * This is very useful for visualization, where you may want to represent * a continuous variable in discrete color or style groups. This function * can provide groups that emphasize differences between data. * * Being a dynamic approach, this algorithm is based on two matrices that * store incrementally-computed values for squared deviations and backtracking * indexes. * * This implementation is based on Ckmeans 3.4.6, which introduced a new divide * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)). * * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html), * this implementation does not include any code to automatically determine * the optimal number of clusters: this information needs to be explicitly * provided. * * ### References * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859 * * from The R Journal Vol. 3/2, December 2011 * @param {Array<number>} x input data, as an array of number values * @param {number} nClusters number of desired classes. This cannot be * greater than the number of values in the data array. * @returns {Array<Array<number>>} clustered input * @throws {Error} if the number of requested clusters is higher than the size of the data * @example * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3); * // The input, clustered into groups of similar numbers. * //= [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]); */function ckmeans(x, nClusters) {    if (nClusters > x.length) {        throw new Error(            "cannot generate more classes than there are data values"        );    }
    const sorted = numericSort(x);    // we'll use this as the maximum number of clusters    const uniqueCount = uniqueCountSorted(sorted);
    // if all of the input values are identical, there's one cluster    // with all of the input in it.    if (uniqueCount === 1) {        return [sorted];    }
    // named 'S' originally    const matrix = makeMatrix(nClusters, sorted.length);    // named 'J' originally    const backtrackMatrix = makeMatrix(nClusters, sorted.length);
    // This is a dynamic programming way to solve the problem of minimizing    // within-cluster sum of squares. It's similar to linear regression    // in this way, and this calculation incrementally computes the    // sum of squares that are later read.    fillMatrices(sorted, matrix, backtrackMatrix);
    // The real work of Ckmeans clustering happens in the matrix generation:    // the generated matrices encode all possible clustering combinations, and    // once they're generated we can solve for the best clustering groups    // very quickly.    const clusters = [];    let clusterRight = backtrackMatrix[0].length - 1;
    // Backtrack the clusters from the dynamic programming matrix. This    // starts at the bottom-right corner of the matrix (if the top-left is 0, 0),    // and moves the cluster target with the loop.    for (let cluster = backtrackMatrix.length - 1; cluster >= 0; cluster--) {        const clusterLeft = backtrackMatrix[cluster][clusterRight];
        // fill the cluster from the sorted input by taking a slice of the        // array. the backtrack matrix makes this easy - it stores the        // indexes where the cluster should start and end.        clusters[cluster] = sorted.slice(clusterLeft, clusterRight + 1);
        if (cluster > 0) {            clusterRight = clusterLeft - 1;        }    }
    return clusters;}
export default ckmeans;