Skip to content

Commit

Permalink
[ML] Move chi2test to package (#167237)
Browse files Browse the repository at this point in the history
The `chi2test` utils so far were only used within the data comparison view.
We plan to use them with other plugins, so this PR moves them to a separate
package. `SIGNIFICANCE_LEVELS` was updated to include some more
digits.
  • Loading branch information
walterra authored Sep 27, 2023
1 parent 93fc807 commit 1b9993e
Show file tree
Hide file tree
Showing 21 changed files with 1,247 additions and 1,092 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ x-pack/plugins/metrics_data_access @elastic/infra-monitoring-ui
x-pack/packages/ml/agg_utils @elastic/ml-ui
x-pack/packages/ml/anomaly_utils @elastic/ml-ui
x-pack/packages/ml/category_validator @elastic/ml-ui
x-pack/packages/ml/chi2test @elastic/ml-ui
x-pack/packages/ml/data_frame_analytics_utils @elastic/ml-ui
x-pack/packages/ml/data_grid @elastic/ml-ui
x-pack/packages/ml/date_picker @elastic/ml-ui
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@
"@kbn/ml-agg-utils": "link:x-pack/packages/ml/agg_utils",
"@kbn/ml-anomaly-utils": "link:x-pack/packages/ml/anomaly_utils",
"@kbn/ml-category-validator": "link:x-pack/packages/ml/category_validator",
"@kbn/ml-chi2test": "link:x-pack/packages/ml/chi2test",
"@kbn/ml-data-frame-analytics-utils": "link:x-pack/packages/ml/data_frame_analytics_utils",
"@kbn/ml-data-grid": "link:x-pack/packages/ml/data_grid",
"@kbn/ml-date-picker": "link:x-pack/packages/ml/date_picker",
Expand Down
2 changes: 2 additions & 0 deletions tsconfig.base.json
Original file line number Diff line number Diff line change
Expand Up @@ -998,6 +998,8 @@
"@kbn/ml-anomaly-utils/*": ["x-pack/packages/ml/anomaly_utils/*"],
"@kbn/ml-category-validator": ["x-pack/packages/ml/category_validator"],
"@kbn/ml-category-validator/*": ["x-pack/packages/ml/category_validator/*"],
"@kbn/ml-chi2test": ["x-pack/packages/ml/chi2test"],
"@kbn/ml-chi2test/*": ["x-pack/packages/ml/chi2test/*"],
"@kbn/ml-data-frame-analytics-utils": ["x-pack/packages/ml/data_frame_analytics_utils"],
"@kbn/ml-data-frame-analytics-utils/*": ["x-pack/packages/ml/data_frame_analytics_utils/*"],
"@kbn/ml-data-grid": ["x-pack/packages/ml/data_grid"],
Expand Down
4 changes: 4 additions & 0 deletions x-pack/packages/ml/chi2test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# @kbn/ml-chi2test

`computeChi2PValue` computes the p-value for how similar the datasets are.
Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
* 2.0.
*/

import { computeChi2PValue } from './data_drift_utils';
import { Histogram } from './types';
import { computeChi2PValue } from './compute_chi_2_pvalue';
import type { Histogram } from './types';

describe('computeChi2PValue()', () => {
test('should return close to 1 if datasets are both empty or nearly identical', () => {
Expand Down Expand Up @@ -83,6 +83,6 @@ describe('computeChi2PValue()', () => {
percentage: 1,
},
];
expect(computeChi2PValue(referenceTerms, comparisonTerms)).toStrictEqual(0);
expect(computeChi2PValue(referenceTerms, comparisonTerms)).toStrictEqual(0.000001);
});
});
48 changes: 48 additions & 0 deletions x-pack/packages/ml/chi2test/compute_chi_2_pvalue.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { criticalTableLookup } from './critical_table_lookup';
import type { Histogram } from './types';

/**
 * Compute the p-value for how similar the datasets are.
 * Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
 *
 * Terms from both histograms are matched by the string form of their key, so
 * numeric and string keys are handled alike. Only the first 100 unique keys
 * (baseline keys first, then drifted keys) take part in the computation.
 * A term that is missing from one histogram, or has no `percentage`, counts as 0.
 *
 * @param {Histogram[]} normalizedBaselineTerms - An array of normalized baseline terms (Histogram objects).
 * @param {Histogram[]} normalizedDriftedTerms - An array of normalized drifted terms (Histogram objects).
 * @returns {number} The p-value indicating the similarity of the datasets.
 */
export const computeChi2PValue = (
  normalizedBaselineTerms: Histogram[],
  normalizedDriftedTerms: Histogram[]
): number => {
  // Index a histogram by stringified key. `Histogram.key` may be a number, so
  // comparing the raw key against a stringified one with `===` would never match.
  // The first occurrence of a key wins, mirroring a front-to-back search.
  const toPercentageByKey = (terms: Histogram[]): Map<string, number> => {
    const percentageByKey = new Map<string, number>();
    for (const term of terms) {
      const termKey = term.key.toString();
      if (!percentageByKey.has(termKey)) {
        percentageByKey.set(termKey, term.percentage ?? 0);
      }
    }
    return percentageByKey;
  };

  const baselinePercentages = toPercentageByKey(normalizedBaselineTerms);
  const driftedPercentages = toPercentageByKey(normalizedDriftedTerms);

  // Get all unique keys from both arrays, capped at 100 entries
  const allKeys: string[] = Array.from(
    new Set([...baselinePercentages.keys(), ...driftedPercentages.keys()])
  ).slice(0, 100);

  // With zero or one category there is nothing to compare: treat as identical
  const degreesOfFreedom = allKeys.length - 1;
  if (degreesOfFreedom < 1) return 1;

  // Calculate the chi-squared statistic
  let chiSquared = 0;
  for (const key of allKeys) {
    const observed = driftedPercentages.get(key) ?? 0;
    const expected = baselinePercentages.get(key) ?? 0;
    // Substitute a tiny denominator when the expected share is 0 to prevent divide by zero
    chiSquared += Math.pow(observed - expected, 2) / (expected > 0 ? expected : 1e-6);
  }

  // Use the criticalTableLookup function to determine the p-value
  return criticalTableLookup(chiSquared, degreesOfFreedom);
};
1,038 changes: 1,038 additions & 0 deletions x-pack/packages/ml/chi2test/constants.ts

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions x-pack/packages/ml/chi2test/critical_table_lookup.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';

/**
 * Performs a lookup in a critical values table to determine the significance level
 * associated with a given chi-squared statistic and degrees of freedom.
 *
 * The row for `df` is scanned for the critical value closest to `chi2Statistic`
 * (the first one wins on ties) and the significance level at the same column
 * index is returned.
 *
 * @param {number} chi2Statistic - The chi-squared statistic for which the significance level is to be determined.
 * @param {number} df - The degrees of freedom (an integer) for the chi-squared test.
 * @returns {number} The significance level corresponding to the chi-squared statistic and degrees of freedom.
 * Returns 1 when df is less than 1.
 * @throws {Error} If df is 1 or greater but not an integer, or if df has no row in the critical values table.
 */
export const criticalTableLookup = (chi2Statistic: number, df: number): number => {
  // No degrees of freedom: nothing to test. This check runs before the integer validation.
  if (df < 1) return 1;
  if (!Number.isInteger(df)) throw Error('Degrees of freedom must be a valid integer');

  // Rows are indexed by degrees of freedom, starting at df = 1
  const criticalValues = CRITICAL_VALUES_TABLE[df - 1];
  if (criticalValues === undefined) {
    // Fail with a clear message instead of a TypeError on an undefined row
    throw Error(
      `Degrees of freedom (${df}) exceed the ${CRITICAL_VALUES_TABLE.length} rows of the critical values table`
    );
  }

  // Find the column whose critical value is closest to the statistic
  let columnIndex = 0;
  let minDiff = Math.abs(criticalValues[0] - chi2Statistic);
  for (let j = 1; j < criticalValues.length; j++) {
    const diff = Math.abs(criticalValues[j] - chi2Statistic);
    if (diff < minDiff) {
      minDiff = diff;
      columnIndex = j;
    }
  }

  // Determine the significance level from the column index
  return SIGNIFICANCE_LEVELS[columnIndex];
};
11 changes: 11 additions & 0 deletions x-pack/packages/ml/chi2test/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export { computeChi2PValue } from './compute_chi_2_pvalue';
export { criticalTableLookup } from './critical_table_lookup';
export { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
export type { Histogram } from './types';
12 changes: 12 additions & 0 deletions x-pack/packages/ml/chi2test/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

// Jest configuration for the chi2test package.
module.exports = {
// Name of the Jest preset this config builds on.
preset: '@kbn/test',
// Relative path pointing four directory levels above this config file.
rootDir: '../../../..',
// Limit Jest's search for tests to this package's directory.
roots: ['<rootDir>/x-pack/packages/ml/chi2test'],
};
5 changes: 5 additions & 0 deletions x-pack/packages/ml/chi2test/kibana.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"type": "shared-common",
"id": "@kbn/ml-chi2test",
"owner": "@elastic/ml-ui"
}
6 changes: 6 additions & 0 deletions x-pack/packages/ml/chi2test/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"name": "@kbn/ml-chi2test",
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}
19 changes: 19 additions & 0 deletions x-pack/packages/ml/chi2test/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"extends": "../../../../tsconfig.base.json",
"compilerOptions": {
"outDir": "target/types",
"types": [
"jest",
"node",
"react"
]
},
"include": [
"**/*.ts",
"**/*.tsx",
],
"exclude": [
"target/**/*"
],
"kbn_references": []
}
24 changes: 24 additions & 0 deletions x-pack/packages/ml/chi2test/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

/**
 * Describes one histogram entry as accepted by computeChi2PValue.
 */
export interface Histogram {
  /**
   * Key identifying the entry; either a string or a number.
   */
  key: string | number;
  /**
   * Document count of the entry.
   */
  doc_count: number;
  /**
   * Percentage of the entry; may be omitted.
   */
  percentage?: number;
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,20 @@
* 2.0.
*/

import React from 'react';

import { SeriesColorAccessor } from '@elastic/charts/dist/chart_types/xy_chart/utils/specs';
import { Axis, BarSeries, Chart, Position, ScaleType, Settings, Tooltip } from '@elastic/charts';
import React from 'react';

import { FIELD_FORMAT_IDS } from '@kbn/field-formats-plugin/common';
import { getFieldFormatType, useFieldFormatter } from './default_value_formatter';
import type { Histogram } from '@kbn/ml-chi2test';

import { DataComparisonChartTooltipBody } from '../data_drift_chart_tooltip_body';
import { NoChartsData } from './no_charts_data';
import { DATA_COMPARISON_TYPE } from '../constants';
import { DataDriftField, Feature, Histogram } from '../types';
import type { DataDriftField, Feature } from '../types';

import { getFieldFormatType, useFieldFormatter } from './default_value_formatter';
import { NoChartsData } from './no_charts_data';

export const SingleDistributionChart = ({
data,
Expand Down
Loading

0 comments on commit 1b9993e

Please sign in to comment.