Skip to main content
Module

x/simplestatistics/src/ckmeans.js

simple statistics for node & browser javascript
Go to Latest
File
import makeMatrix from "./make_matrix.js";import numericSort from "./numeric_sort.js";import uniqueCountSorted from "./unique_count_sorted.js";
/** * Generates incrementally computed values based on the sums and sums of * squares for the data array * * @private * @param {number} j * @param {number} i * @param {Array<number>} sums * @param {Array<number>} sumsOfSquares * @return {number} * @example * ssq(0, 1, [-1, 0, 2], [1, 1, 5]); */function ssq(j, i, sums, sumsOfSquares) { let sji; // s(j, i) if (j > 0) { const muji = (sums[i] - sums[j - 1]) / (i - j + 1); // mu(j, i) sji = sumsOfSquares[i] - sumsOfSquares[j - 1] - (i - j + 1) * muji * muji; } else { sji = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1); } if (sji < 0) { return 0; } return sji;}
/** * Function that recursively divides and conquers computations * for cluster j * * @private * @param {number} iMin Minimum index in cluster to be computed * @param {number} iMax Maximum index in cluster to be computed * @param {number} cluster Index of the cluster currently being computed * @param {Array<Array<number>>} matrix * @param {Array<Array<number>>} backtrackMatrix * @param {Array<number>} sums * @param {Array<number>} sumsOfSquares */function fillMatrixColumn( iMin, iMax, cluster, matrix, backtrackMatrix, sums, sumsOfSquares) { if (iMin > iMax) { return; }
// Start at midpoint between iMin and iMax const i = Math.floor((iMin + iMax) / 2);
matrix[cluster][i] = matrix[cluster - 1][i - 1]; backtrackMatrix[cluster][i] = i;
let jlow = cluster; // the lower end for j
if (iMin > cluster) { jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0); } jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);
let jhigh = i - 1; // the upper end for j if (iMax < matrix[0].length - 1) { jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0); }
let sji; let sjlowi; let ssqjlow; let ssqj; for (let j = jhigh; j >= jlow; --j) { sji = ssq(j, i, sums, sumsOfSquares);
if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) { break; }
// Examine the lower bound of the cluster border sjlowi = ssq(jlow, i, sums, sumsOfSquares);
ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];
if (ssqjlow < matrix[cluster][i]) { // Shrink the lower bound matrix[cluster][i] = ssqjlow; backtrackMatrix[cluster][i] = jlow; } jlow++;
ssqj = sji + matrix[cluster - 1][j - 1]; if (ssqj < matrix[cluster][i]) { matrix[cluster][i] = ssqj; backtrackMatrix[cluster][i] = j; } }
fillMatrixColumn( iMin, i - 1, cluster, matrix, backtrackMatrix, sums, sumsOfSquares ); fillMatrixColumn( i + 1, iMax, cluster, matrix, backtrackMatrix, sums, sumsOfSquares );}
/** * Initializes the main matrices used in Ckmeans and kicks * off the divide and conquer cluster computation strategy * * @private * @param {Array<number>} data sorted array of values * @param {Array<Array<number>>} matrix * @param {Array<Array<number>>} backtrackMatrix */function fillMatrices(data, matrix, backtrackMatrix) { const nValues = matrix[0].length;
// Shift values by the median to improve numeric stability const shift = data[Math.floor(nValues / 2)];
// Cumulative sum and cumulative sum of squares for all values in data array const sums = []; const sumsOfSquares = [];
// Initialize first column in matrix & backtrackMatrix for (let i = 0, shiftedValue; i < nValues; ++i) { shiftedValue = data[i] - shift; if (i === 0) { sums.push(shiftedValue); sumsOfSquares.push(shiftedValue * shiftedValue); } else { sums.push(sums[i - 1] + shiftedValue); sumsOfSquares.push( sumsOfSquares[i - 1] + shiftedValue * shiftedValue ); }
// Initialize for cluster = 0 matrix[0][i] = ssq(0, i, sums, sumsOfSquares); backtrackMatrix[0][i] = 0; }
// Initialize the rest of the columns let iMin; for (let cluster = 1; cluster < matrix.length; ++cluster) { if (cluster < matrix.length - 1) { iMin = cluster; } else { // No need to compute matrix[K-1][0] ... matrix[K-1][N-2] iMin = nValues - 1; }
fillMatrixColumn( iMin, nValues - 1, cluster, matrix, backtrackMatrix, sums, sumsOfSquares ); }}
/** * Ckmeans clustering is an improvement on heuristic-based clustering * approaches like Jenks. The algorithm was developed in * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf) * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach * to the problem of clustering numeric data into groups with the least * within-group sum-of-squared-deviations. * * Minimizing the difference within groups - what Wang & Song refer to as * `withinss`, or within sum-of-squares, means that groups are optimally * homogenous within and the data is split into representative groups. * This is very useful for visualization, where you may want to represent * a continuous variable in discrete color or style groups. This function * can provide groups that emphasize differences between data. * * Being a dynamic approach, this algorithm is based on two matrices that * store incrementally-computed values for squared deviations and backtracking * indexes. * * This implementation is based on Ckmeans 3.4.6, which introduced a new divide * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)). * * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html), * this implementation does not include any code to automatically determine * the optimal number of clusters: this information needs to be explicitly * provided. * * ### References * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859 * * from The R Journal Vol. 3/2, December 2011 * @param {Array<number>} x input data, as an array of number values * @param {number} nClusters number of desired classes. This cannot be * greater than the number of values in the data array. * @returns {Array<Array<number>>} clustered input * @throws {Error} if the number of requested clusters is higher than the size of the data * @example * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3); * // The input, clustered into groups of similar numbers. * //= [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]); */function ckmeans(x, nClusters) { if (nClusters > x.length) { throw new Error( "cannot generate more classes than there are data values" ); }
const sorted = numericSort(x); // we'll use this as the maximum number of clusters const uniqueCount = uniqueCountSorted(sorted);
// if all of the input values are identical, there's one cluster // with all of the input in it. if (uniqueCount === 1) { return [sorted]; }
// named 'S' originally const matrix = makeMatrix(nClusters, sorted.length); // named 'J' originally const backtrackMatrix = makeMatrix(nClusters, sorted.length);
// This is a dynamic programming way to solve the problem of minimizing // within-cluster sum of squares. It's similar to linear regression // in this way, and this calculation incrementally computes the // sum of squares that are later read. fillMatrices(sorted, matrix, backtrackMatrix);
// The real work of Ckmeans clustering happens in the matrix generation: // the generated matrices encode all possible clustering combinations, and // once they're generated we can solve for the best clustering groups // very quickly. const clusters = []; let clusterRight = backtrackMatrix[0].length - 1;
// Backtrack the clusters from the dynamic programming matrix. This // starts at the bottom-right corner of the matrix (if the top-left is 0, 0), // and moves the cluster target with the loop. for (let cluster = backtrackMatrix.length - 1; cluster >= 0; cluster--) { const clusterLeft = backtrackMatrix[cluster][clusterRight];
// fill the cluster from the sorted input by taking a slice of the // array. the backtrack matrix makes this easy - it stores the // indexes where the cluster should start and end. clusters[cluster] = sorted.slice(clusterLeft, clusterRight + 1);
if (cluster > 0) { clusterRight = clusterLeft - 1; } }
return clusters;}
export default ckmeans;