diff --git a/scripts/builtin/outlierByIsolationForest.dml b/scripts/builtin/outlierByIsolationForest.dml new file mode 100644 index 00000000000..19a9658d6fe --- /dev/null +++ b/scripts/builtin/outlierByIsolationForest.dml @@ -0,0 +1,465 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Builtin function that implements anomaly detection via isolation forest as described in +# [Liu2008]: +# Liu, F. T., Ting, K. M., & Zhou, Z. H. +# (2008, December). +# Isolation forest. +# In 2008 eighth ieee international conference on data mining (pp. 413-422). +# IEEE. +# +# This function creates an iForest model for outlier detection. +# +# .. code-block:: python +# +# >>> import numpy as np +# >>> from systemds.context import SystemDSContext +# >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply +# >>> with SystemDSContext() as sds: +# ... # Create training data: 20 points clustered near origin +# ... X_train = sds.from_numpy(np.array([ +# ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], +# ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], +# ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], +# ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] +# ... ])) +# ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) +# ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) +# ... scores = outlierByIsolationForestApply(model, X_test).compute() +# ... print(scores.shape) +# ... print(scores[1, 0] > scores[0, 0]) +# ... print(scores[1, 0] > 0.5) +# (2, 1) +# True +# True +# +# +# INPUT: +# --------------------------------------------------------------------------------------------- +# X Numerical feature matrix +# n_trees Number of iTrees to build +# subsampling_size Size of the subsample to build iTrees with +# seed Seed for calls to `sample` and `rand`. -1 corresponds to a random seed +# --------------------------------------------------------------------------------------------- +# +# OUTPUT: +# --------------------------------------------------------------------------------------------- +# iForestModel The trained iForest model to be used in outlierByIsolationForestApply. +# The model is represented as a list with two entries: +# Entry 'model' (Matrix[Double]) - The iForest Model in linearized form (see m_iForest) +# Entry 'subsampling_size' (Double) - The subsampling size used to build the model. 
+# --------------------------------------------------------------------------------------------- + +s_outlierByIsolationForest = function(Matrix[Double] X, Integer n_trees, Integer subsampling_size, Integer seed = -1) + return(List[Unknown] iForestModel) +{ + iForestModel = m_outlierByIsolationForest(X, n_trees, subsampling_size, seed) +} + +m_outlierByIsolationForest = function(Matrix[Double] X, Integer n_trees, Integer subsampling_size, Integer seed = -1) + return(List[Unknown] iForestModel) +{ + M = m_iForest(X, n_trees, subsampling_size, seed) + iForestModel = list(model=M, subsampling_size=subsampling_size) +} + +# This function implements isolation forest for numerical input features as +# described in [Liu2008]. +# +# The returned 'linearized' model is of type Matrix[Double] where each row +# corresponds to a linearized iTree (see m_iTree). Note that each tree in the +# model is padded with placeholder nodes such that each iTree has the same maximum depth. +# +# .. code-block:: +# +# For example, give a feature matrix with features [a,b,c,d] +# and the following iForest, M would look as follows: +# +# Level Tree 1 Tree 2 Node Depth +# ------------------------------------------------------------------- +# (L1) |d<=5| |b<=6| 0 +# / \ / \ +# (L2) 2 |a<=7| 20 0 1 +# / \ +# (L3) 10 8 2 +# +# --> M := +# [[ 4, 5, 0, 2, 1, 7, -1, -1, -1, -1, 0, 10, 0, 8], (Tree 1) +# [ 2, 6, 0, 20, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1]] (Tree 2) +# | (L1) | | (L2) | | (L3) | +# +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix[Double] Numerical feature matrix +# n_trees Int Number of iTrees to build +# subsampling_size Int Size of the subsample to build iTrees with +# seed Int -1 Seed for calls to `sample` and `rand`. +# -1 corresponds to a random seed +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# M Matrix containing the learned iForest in linearized form +# --------------------------------------------------------------------------------------------- +m_iForest = function(Matrix[Double] X, Integer n_trees, Integer subsampling_size, Integer seed = -1) + return(Matrix[Double] M) +{ + # check assumptions + s_warning_assert_outlierByIsolationForest(n_trees > 0, "iForest: Requirement n_trees > 0 not satisfied! ntrees: "+toString(n_trees)) + s_warning_assert_outlierByIsolationForest(subsampling_size > 1 & subsampling_size <= nrow(X), "iForest: Requirement 0 < subsampling_size <= nrow(X) not satisfied! 
subsampling_size: "+toString(subsampling_size)+"; nrow(X): "+toString(nrow(X))) + + height_limit = ceil(log(subsampling_size, 2)) + tree_size = 2*(2^(height_limit+1)-1) + + # initialize the model + M = matrix(-1, cols=tree_size, rows=n_trees) + seeds = matrix(seq(1, n_trees), cols=n_trees, rows=1)*seed + + parfor ( i_iTree in 1:n_trees, taskpartitioner="STATIC") { + # subsample rows + tree_seed = ifelse(seed == -1, -1, as.scalar(seeds[1, i_iTree])) + X_subsample = s_sampleRows(X, subsampling_size, tree_seed) + + # Build iTree + tree_seed = ifelse(seed == -1, -1, tree_seed+42) + M_tree = m_iTree(X_subsample, height_limit, tree_seed) + + # Add iTree to the model + M[i_iTree, 1:ncol(M_tree)] = M_tree + } +} + +# This function implements isolation trees for numerical input features as +# described in [Liu2008]. +# +# The returned 'linearized' model is of type Matrix[Double] with exactly one row. +# Here, each node is represented by two consecutive entries in this row vector. +# Traversing the row vector from left to right corresponds to traversing the tree +# level-wise from top to bottom and left to right. If a node does not exist +# (e.g. because the parent node is already a leaf node), the node is still stored +# using placeholder values. +# Recall that for a binary tree with maximum depth `d`, the maximum number of nodes +# `can be calculated by `2^(maximum depth + 1) - 1`. Hence, for a given maximum depth +# of an iTree, the row vector will have exactly `2*2^(maximum depth + 1) - 1` entries. +# +# There are three types of nodes that are represented in this model: +# - Internal Node +# A node a that based on a "split feature" and corresponding "split value" +# devides the data into two parts, one of which can potentially be an empty set. +# The node is lineraized in the following way: +# - Entry 1: Represents the index of the splitting feature in the feature matrix `X` +# - Entry 2: Represents splitting value +# +# - External Node +# A leaf node of the tree, It contains the "size" of the node. That is the +# number of remaining samples after splitting the feature matrix X by traversing +# the tree to this node. +# The node is lineraized in the following way: +# - Entry 1: Always 0 - indicating an external node +# - Entry 2: The "size" of the node +# +# - Placeholder Node +# A node that is not present in the actual iTree and is used for "padding". +# Both entries are set to -1 +# +# .. code-block:: +# +# For example, give a feature matrix with features [a,b,c,d] +# and the following tree, M would look as follows: +# Level Tree Node Depth +# ------------------------------------------------- +# (L1) |d<5| 0 +# / \ +# (L2) 1 |a<7| 1 +# / \ +# (L3) 10 0 2 +# +# --> M := +# [[4, 5, 0, 1, 1, 7, -1, -1, -1, -1, 0, 10, 0, 0]] +# |(L1)| | (L2) | | (L3) | +# +# +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix[Double] Numerical feature matrix +# max_depth Int Maximum depth of the learned tree where depth is the +# maximum number of edges from root to a leaf note +# seed Int -1 Seed for calls to `sample` and `rand`. 
+#                                              -1 corresponds to a random seed
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# M                 Matrix containing the learned tree in linearized form
+# ---------------------------------------------------------------------------------------------
+m_iTree = function(Matrix[Double] X, Integer max_depth, Integer seed = -1)
+  return(Matrix[Double] M)
+{
+  # check assumptions
+  s_warning_assert_outlierByIsolationForest(max_depth > 0 & max_depth <= 32, "iTree: Requirement 0 < max_depth <= 32 not satisfied! max_depth: " + max_depth)
+  s_warning_assert_outlierByIsolationForest(nrow(X) > 0, "iTree: Requirement `nrow(X) > 0` not satisfied!")
+
+  # Initialize M to largest possible matrix given max_depth
+  # Note that each node takes exactly 2 indices in M and the root node has depth 0
+  M = matrix(-1, rows=1, cols=2*(2^(max_depth+1)-1))
+
+  # Queue for implementing the recursion of the original algorithm iteratively.
+  # Each entry in the queue corresponds to a node of the tree that is to be added to the model M
+  # and, in case of internal nodes, split further.
+  # Nodes in this queue are represented by an ID (first index) and the data corresponding to the node (second index)
+  node_queue = list(list(1, X));
+  # variable tracking the maximum node ID in the tree
+  max_id = 1;
+  while (length(node_queue) > 0) {
+    # pop next node from queue for splitting
+    [node_queue, queue_entry] = remove(node_queue, 1);
+    node = as.list(queue_entry);
+    node_id = as.scalar(node[1]);
+    X_node = as.matrix(node[2]);
+
+    max_id = max(max_id, node_id)
+
+    is_external_leaf = s_isExternalINode(X_node, node_id, max_depth)
+    if (is_external_leaf) {
+      # External Node: Add node to model
+      M = s_addExternalINode(X_node, node_id, M)
+    }
+    else {
+      # Internal Node: Draw split criterion, add node to model and queue child nodes
+      seed = ifelse(seed == -1, -1, node_id*seed)
+      [split_feature, split_value] = s_drawSplitPoint(X_node, seed)
+      M = s_addInternalINode(node_id, split_feature, split_value, M)
+      [left_id, X_left, right_id, X_right] = s_splitINode(X_node, node_id, split_feature, split_value)
+
+      node_queue = append(node_queue, list(left_id, X_left))
+      node_queue = append(node_queue, list(right_id, X_right))
+    }
+  }
+
+  # Prune the model to the actual tree depth
+  tree_depth = floor(log(max_id, 2))
+  M = M[1, 1:2*(2^(tree_depth+1) - 1)];
+}
+
+
+# Randomly draws a split point, i.e. a feature and a corresponding value to split a node by.
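+# The feature index is drawn uniformly at random from the columns of X, and the split value is
+# drawn uniformly at random between the minimum and maximum value of that feature, following [Liu2008].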
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME               TYPE             DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X                  Matrix[Double]            Numerical feature matrix
+# seed               Int              -1       Seed for calls to `sample` and `rand`
+#                                              -1 corresponds to a random seed
+#
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# split_feature     Index of the feature used for splitting the node
+# split_value       Feature value used for splitting the node
+# ---------------------------------------------------------------------------------------------
+s_drawSplitPoint = function(Matrix[Double] X, Integer seed = -1)
+  return(Integer split_feature, Double split_value)
+{
+  # find random feature and a value between the min and max values of that feature to split the node by
+  split_feature = as.integer(as.scalar(sample(ncol(X), 1, FALSE, seed)))
+  split_value = as.scalar(rand(
+    rows=1, cols=1,
+    min=min(X[, split_feature]),
+    max=max(X[, split_feature]),
+    seed=seed
+  ))
+}
+
+# Adds an external (leaf) node to the linearized iTree model `M`. In the linearized form,
+# each node is assigned two neighboring indices. For external nodes, the value at the first
+# index in M is always set to 0 while the value at the second index is set to the number of
+# rows in the feature matrix corresponding to the node.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME               TYPE             DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X_node             Matrix[Double]            Numerical feature matrix corresponding to the node
+# node_id            Int                       ID of the node
+# M                  Matrix[Double]            Linearized model to add the node to
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# M                 The updated model
+# ---------------------------------------------------------------------------------------------
+s_addExternalINode = function(Matrix[Double] X_node, Integer node_id, Matrix[Double] M)
+  return(Matrix[Double] M)
+{
+  s_warning_assert_outlierByIsolationForest(node_id > 0, "s_addExternalINode: Requirement `node_id > 0` not satisfied!")
+
+  node_start_index = 2*node_id-1
+  M[, node_start_index] = 0
+  M[, node_start_index + 1] = nrow(X_node)
+}
+
+# Adds an internal node to the linearized iTree model `M`. In the linearized form,
+# each node is assigned two neighboring indices. For internal nodes, the value at the first
+# index in M is set to the index of the feature to split by while the value at the second index
+# is set to the value to split the node by.
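+# For example, an internal node with node_id 3 that splits on feature 2 at value 0.7 occupies
+# entries 5 and 6 of M (node_start_index = 2*3-1 = 5), i.e. M[1,5] = 2 and M[1,6] = 0.7.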
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME               TYPE             DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# node_id            Int                       ID of the node
+# split_feature      Int                       Index of the feature to split the node by
+# split_value        Double                    Value to split the node by
+# M                  Matrix[Double]            Linearized model to add the node to
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# M                 The updated model
+# ---------------------------------------------------------------------------------------------
+s_addInternalINode = function(Integer node_id, Integer split_feature, Double split_value, Matrix[Double] M)
+  return(Matrix[Double] M)
+{
+  s_warning_assert_outlierByIsolationForest(node_id > 0, "s_addInternalINode: Requirement `node_id > 0` not satisfied!")
+  s_warning_assert_outlierByIsolationForest(split_feature > 0, "s_addInternalINode: Requirement `split_feature > 0` not satisfied!")
+
+  node_start_index = 2*node_id-1
+  M[, node_start_index] = split_feature
+  M[, node_start_index + 1] = split_value
+}
+
+# This function determines whether an iTree node is an external node, based on its node_id and the data corresponding to the node
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME               TYPE             DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X_node             Matrix[Double]            Numerical feature matrix corresponding to the node
+# node_id            Int                       ID belonging to the node
+# max_depth          Int                       Maximum depth of the learned tree where depth is the
+#                                              maximum number of edges from root to a leaf node
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# isExternalNode    true if the node is an external (leaf) node, false otherwise.
+#                   This is the case when the maximum depth is reached or the number of rows
+#                   in the feature matrix corresponding to the node is <= 1
+# ---------------------------------------------------------------------------------------------
+s_isExternalINode = function(Matrix[Double] X_node, Integer node_id, Integer max_depth)
+  return(Boolean isExternalNode)
+{
+  s_warning_assert_outlierByIsolationForest(max_depth > 0, "s_isExternalINode: Requirement `max_depth > 0` not satisfied!")
+  s_warning_assert_outlierByIsolationForest(node_id > 0, "s_isExternalINode: Requirement `node_id > 0` not satisfied!")
+
+  node_depth = floor(log(node_id, 2))
+  isExternalNode = node_depth >= max_depth | nrow(X_node) <= 1
+}
+
+
+# This function splits a node based on a given feature and value and returns the sub-matrices
+# and IDs corresponding to the nodes resulting from the split.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME               TYPE             DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X_node             Matrix[Double]            Numerical feature matrix corresponding to the node
+# node_id            Int                       ID of the node to split
+# split_feature      Int                       Index of the feature to split the input matrix by
+# split_value        Double                    Value of the feature to split the input matrix by
+#
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# left_id           ID of the resulting left node
+# X_left            Matrix corresponding to the left node resulting from the split, with rows where
+#                   the value for feature `split_feature` is <= `split_value`
+# right_id          ID of the resulting right node
+# X_right           Matrix corresponding to the right node resulting from the split, with rows where
+#                   the value for feature `split_feature` is > `split_value`
+# ---------------------------------------------------------------------------------------------
+s_splitINode = function(Matrix[Double] X_node, Integer node_id, Integer split_feature, Double split_value)
+  return(Integer left_id, Matrix[Double] X_left, Integer right_id, Matrix[Double] X_right)
+{
+  s_warning_assert_outlierByIsolationForest(nrow(X_node) > 0, "s_splitINode: Requirement `nrow(X_node) > 0` not satisfied!")
+  s_warning_assert_outlierByIsolationForest(node_id > 0, "s_splitINode: Requirement `node_id > 0` not satisfied!")
+  s_warning_assert_outlierByIsolationForest(split_feature > 0, "s_splitINode: Requirement `split_feature > 0` not satisfied!")
+
+  left_rows_mask = X_node[, split_feature] <= split_value
+
+  # In the linearized form of the iTree model, nodes need to be ordered by depth.
+  # Since iTrees are binary trees, we can use 2*node_id / 2*node_id+1 for the left/right child ids
+  # to ensure that IDs are chosen accordingly.
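+  # For example, the root (id 1) gets children with ids 2 and 3, and a node with id 5 gets
+  # children with ids 10 and 11, matching the level-order layout of the linearized model.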
+ left_id = 2 * node_id + X_left = removeEmpty(target=X_node, margin="rows", select=left_rows_mask, empty.return=FALSE) + + right_id = 2 * node_id + 1 + X_right = removeEmpty(target=X_node, margin="rows", select=!left_rows_mask, empty.return=FALSE) +} + +# Randomly samples `size` rows from a matrix X +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Matrix[Double] Matrix to sample rows from +# sample_size Int Number of rows to sample +# seed Int -1 Seed for calls to `sample` +# -1 corresponds to a random seed +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# X_sampled Sampled rows from X +# --------------------------------------------------------------------------------------------- +s_sampleRows = function(Matrix[Double] X, Integer size, Integer seed = -1) + return(Matrix[Double] X_extracted) +{ + s_warning_assert_outlierByIsolationForest(size > 0 & nrow(X) >= size, "s_sampleRows: Requirements `size > 0 & nrow(X) >= size` not satisfied") + random_vector = rand(rows=nrow(X), cols=1, seed=seed) + X_rand = cbind(X, random_vector) + + # order by random vector and return `size` nr of rows` + X_rand = order(target=X_rand, by=ncol(X_rand)) + X_extracted = X_rand[1:size, 1:ncol(X)] +} + +# Function that gives a warning if a assertion is violated. This is used instead of `assert` and +# `stop` since these function can not be used in parfor . +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# assertion Boolean Assertion to check +# warning String Warning message to print if assertion is violated +# --------------------------------------------------------------------------------------------- +s_warning_assert_outlierByIsolationForest = function(Boolean assertion, String warning) +{ + if (!assertion) + print("outlierIsolationForest: "+warning) +} diff --git a/scripts/builtin/outlierByIsolationForestApply.dml b/scripts/builtin/outlierByIsolationForestApply.dml new file mode 100644 index 00000000000..8951bc08709 --- /dev/null +++ b/scripts/builtin/outlierByIsolationForestApply.dml @@ -0,0 +1,256 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Builtin function that calculates the anomaly score as described in [Liu2008] +# for a set of samples `X` based on an iForest model. +# +# [Liu2008]: +# Liu, F. T., Ting, K. M., & Zhou, Z. H. +# (2008, December). +# Isolation forest. +# In 2008 eighth ieee international conference on data mining (pp. 413-422). +# IEEE. +# +# .. code-block:: python +# +# >>> import numpy as np +# >>> from systemds.context import SystemDSContext +# >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply +# >>> with SystemDSContext() as sds: +# ... # Create training data: 20 points clustered near origin +# ... X_train = sds.from_numpy(np.array([ +# ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], +# ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], +# ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], +# ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] +# ... ])) +# ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) +# ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) +# ... scores = outlierByIsolationForestApply(model, X_test).compute() +# ... print(scores.shape) +# ... print(scores[1, 0] > scores[0, 0]) +# ... print(scores[1, 0] > 0.5) +# (2, 1) +# True +# True +# +# +# INPUT: +# --------------------------------------------------------------------------------------------- +# iForestModel The trained iForest model as returned by outlierByIsolationForest +# X Samples to calculate the anomaly score for +# --------------------------------------------------------------------------------------------- +# +# OUTPUT: +# --------------------------------------------------------------------------------------------- +# anomaly_scores Column vector of anomaly scores corresponding to the samples in X. 
+# Samples with an anomaly score > 0.5 are generally considered to be outliers +# --------------------------------------------------------------------------------------------- + +s_outlierByIsolationForestApply = function(List[Unknown] iForestModel, Matrix[Double] X) + return(Matrix[Double] anomaly_scores) +{ + anomaly_scores = m_outlierByIsolationForestApply(iForestModel, X) +} + +m_outlierByIsolationForestApply = function(List[Unknown] iForestModel, Matrix[Double] X) + return(Matrix[Double] anomaly_scores) +{ + assert(nrow(X) > 1) + + M = as.matrix(iForestModel['model']) + subsampling_size = as.integer(as.scalar(iForestModel['subsampling_size'])) + assert(subsampling_size > 1) + + height_limit = ceil(log(subsampling_size, 2)) + tree_size = 2*(2^(height_limit+1)-1) + assert(ncol(M) == tree_size & nrow(M) > 1) + + anomaly_scores = matrix(0, rows=nrow(X), cols=1) + for ( i_x in 1:nrow(X)) { + anomaly_scores[i_x, 1] = m_score(M, X[i_x,], subsampling_size) + } +} + +# Calculates the PathLength as defined in [Liu2008] based on a sample x +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# M Matrix[Double] The linearized iTree model +# x Matrix[Double] The sample to calculate the PathLength +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# PathLength The PathLength for the sample +# --------------------------------------------------------------------------------------------- +m_PathLength = function(Matrix[Double] M, Matrix[Double] x) + return(Double PathLength) +{ + [nrEdgesTraversed, externalNodeSize] = s_traverseITree(M, x) + + if (externalNodeSize <= 1) { + PathLength = nrEdgesTraversed + } + else { + PathLength = nrEdgesTraversed + s_cn(externalNodeSize) + } +} + + +# Traverses an iTree based on a sample x +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# M Matrix[Double] The linearized iTree model to traverse +# x Matrix[Double] The sample to traverse the iTree with +# +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# nrEdgesTraversed The number of edges traversed until an external node was reached +# externalNodeSize The size of of the external node assigned to during training +# --------------------------------------------------------------------------------------------- +s_traverseITree = function(Matrix[Double] M, Matrix[Double] x) + return(Integer nrEdgesTraversed, Integer externalNodeSize) +{ + s_warning_assert(nrow(x) == 1, "s_traverseITree: Requirement `nrow(x) == 1` not satisfied!") + + nrEdgesTraversed = 0 + is_external_node = FALSE + node_id = 1 + while (!is_external_node) + { + node_start_idx = (node_id*2) - 1 + split_feature = as.integer(as.scalar(M[1,node_start_idx])) + node_value = as.scalar(M[1,node_start_idx + 1]) + + if (split_feature > 0) { + # internal node - node_value = split_value + nrEdgesTraversed = nrEdgesTraversed + 1 + x_val = 
as.scalar(x[1, split_feature])
+      if (x_val <= node_value) {
+        # go down left
+        node_id = (node_id * 2)
+      }
+      else {
+        # go down right
+        node_id = (node_id * 2) + 1
+      }
+    }
+    else if (split_feature == 0) {
+      # External node - node_value = node size
+      externalNodeSize = as.integer(node_value)
+      is_external_node = TRUE
+    }
+    else {
+      s_warning_assert(FALSE, "iTree is not valid!")
+    }
+  }
+}
+
+
+# This function gives the average path length `c(n)` of an unsuccessful search in a binary
+# search tree (BST) with `n` nodes, as given in [Liu2008]. It is used to normalize the path length.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME               TYPE             DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# n                  Int                       Number of samples corresponding to an external
+#                                              node, for which c(n) should be calculated
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# cn                Value for c(n)
+# ---------------------------------------------------------------------------------------------
+s_cn = function(Integer n)
+  return(Double cn)
+{
+  s_warning_assert(n > 1, "s_cn: Requirement `n > 1` not satisfied!")
+
+  # Calculate H(n-1)
+  # The approximation of the Harmonic Number H using `log(n) + eulergamma` has a higher error
+  # for low n. We hence calculate it directly for the first 1000 values.
+  # TODO: Discuss a good value for n --> use e.g. HarmonicNumber(1000) - (ln(1000) + 0.5772156649) in WA
+  if (n < 1000) {
+    indices = seq(1,n-1)
+    H_nminus1 = sum(1/indices)
+  }
+  else {
+    # Euler–Mascheroni constant
+    eulergamma = 0.57721566490153
+    # Approximation of the harmonic number H(n - 1)
+    H_nminus1 = log(n-1) + eulergamma
+  }
+
+  cn = 2*H_nminus1 - 2*(n-1)/n
+}
+
+# Scores a sample `x` according to the anomaly score function `s(x, n)` for a subsample size `n`,
+# as described in [Liu2008]. Note that if the mean path length over all iTrees equals c(n),
+# the resulting score is exactly 0.5; scores closer to 1 indicate anomalies.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME               TYPE             DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# M                  Matrix[Double]            iForest model used to score
+# x                  Matrix[Double]            Sample to be scored
+# n                  Int                       Subsample size the iTrees were built from
+# ---------------------------------------------------------------------------------------------
+# OUTPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# score             The anomaly score s(x, n) for the sample x
+# ---------------------------------------------------------------------------------------------
+m_score = function(Matrix[Double] M, Matrix[Double] x, Integer n)
+  return(Double score)
+{
+  s_warning_assert(n > 1, "m_score: Requirement `n > 1` not satisfied!")
+  s_warning_assert(nrow(x) == 1, "m_score: sample has the wrong dimension!")
+  s_warning_assert(nrow(M) > 1, "m_score: invalid iForest Model!")
+
+  h = matrix(0, cols=nrow(M), rows=1)
+  for (i_iTree in 1:nrow(M)) {
+    h[1, i_iTree] = m_PathLength(M[i_iTree,], x)
+  }
+
+  score = 2^-(mean(h)/s_cn(n))
+}
+
+# Function that prints a warning if an assertion is violated. This is used instead of `assert` and
+# `stop` since those functions cannot be used inside parfor.
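+# For example, a violated `n > 1` check in s_cn only results in a printed warning
+# instead of aborting execution.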
+# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# assertion Boolean Assertion to check +# warning String Warning message to print if assertion is violated +# --------------------------------------------------------------------------------------------- +s_warning_assert = function(Boolean assertion, String warning) +{ + if (!assertion) + print("outlierIsolationForest: "+warning) +} \ No newline at end of file diff --git a/scripts/staging/isolationForest/test/isolationForestTest.dml b/scripts/staging/isolationForest/test/isolationForestTest.dml index 9decfe6087d..1bae38aced1 100644 --- a/scripts/staging/isolationForest/test/isolationForestTest.dml +++ b/scripts/staging/isolationForest/test/isolationForestTest.dml @@ -702,6 +702,234 @@ parfor (i_run in 1:nr_runs) { print("\n") } +#------------------------------------------------------------- +# Isolation Forest - Edge Case Tests +# Following the structure of isolationForestTest.dml +#------------------------------------------------------------- + +print("===============================================================") +print("Isolation Forest - Edge Case Tests") +print("===============================================================") +print("") + +# Test data +X_100x5 = rand(rows=100, cols=5, seed=42) +X_zeros = matrix(0.0, rows=100, cols=5) +X_identical = matrix(5.0, rows=100, cols=5) + +print("===============================================================") +print("CATEGORY 1: Extreme Data Sizes") +print("===============================================================") +print("") + +# Test 1: Minimum dataset (2 rows) +print("Test 1: Minimum dataset (2 rows, 2 features)") +testname = "Minimum dataset (2x2)" +X_tiny = rand(rows=2, cols=2, seed=42) +model_tiny = iForest::outlierByIsolationForest(X=X_tiny, n_trees=10, subsampling_size=2) +scores_tiny = iForest::outlierByIsolationForestApply(iForestModel=model_tiny, X=X_tiny) +test_res = (nrow(scores_tiny) == 2) & (ncol(scores_tiny) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 2: Very small dataset (3 rows) +print("Test 2: Very small dataset (3 rows, 3 features)") +testname = "Very small dataset (3x3)" +X_mini = rand(rows=3, cols=3, seed=42) +model_mini = iForest::outlierByIsolationForest(X=X_mini, n_trees=5, subsampling_size=3) +scores_mini = iForest::outlierByIsolationForestApply(iForestModel=model_mini, X=X_mini) +test_res = (nrow(scores_mini) == 3) & (ncol(scores_mini) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 3: Large dataset +print("Test 3: Large dataset (5,000 rows, 10 features)") +testname = "Large dataset (5000x10)" +X_large = rand(rows=5000, cols=10, seed=42) +model_large = iForest::outlierByIsolationForest(X=X_large, n_trees=30, subsampling_size=256) +scores_large = iForest::outlierByIsolationForestApply(iForestModel=model_large, X=X_large[1:10,]) +test_res = (nrow(scores_large) == 10) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +print("===============================================================") +print("CATEGORY 2: Extreme Feature Counts") +print("===============================================================") +print("") + +# Test 4: Single feature +print("Test 4: Single feature dataset (100 rows, 1 feature)") 
+testname = "Single feature dataset" +X_one_feature = rand(rows=100, cols=1, seed=42) +model_one = iForest::outlierByIsolationForest(X=X_one_feature, n_trees=20, subsampling_size=50) +scores_one = iForest::outlierByIsolationForestApply(iForestModel=model_one, X=X_one_feature) +test_res = (nrow(scores_one) == 100) & (ncol(scores_one) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 5: High-dimensional dataset +print("Test 5: High-dimensional dataset (200 rows, 30 features)") +testname = "High-dimensional dataset (30 features)" +X_high_dim = rand(rows=200, cols=30, seed=42) +model_high = iForest::outlierByIsolationForest(X=X_high_dim, n_trees=20, subsampling_size=100) +scores_high = iForest::outlierByIsolationForestApply(iForestModel=model_high, X=X_high_dim[1:10,]) +test_res = (nrow(scores_high) == 10) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +print("===============================================================") +print("CATEGORY 3: Extreme Values") +print("===============================================================") +print("") + +# Test 6: All zeros +print("Test 6: All values are zero") +testname = "All zeros" +model_zeros = iForest::outlierByIsolationForest(X=X_zeros, n_trees=20, subsampling_size=50) +scores_zeros = iForest::outlierByIsolationForestApply(iForestModel=model_zeros, X=X_zeros) +mean_score = mean(scores_zeros) +# When all data is identical, algorithm can't split -> maximum path length +# -> LOW anomaly scores (all points are equally "normal") +test_res = (mean_score > 0.2) & (mean_score < 0.35) # Expect low scores for identical data +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Mean score: " + toString(mean_score) + " (low score expected - no variation to detect)") +print("") + +# Test 7: All identical values +print("Test 7: All identical values (all 5s)") +testname = "All identical values" +model_identical = iForest::outlierByIsolationForest(X=X_identical, n_trees=20, subsampling_size=50) +scores_identical = iForest::outlierByIsolationForestApply(iForestModel=model_identical, X=X_identical) +mean_score = mean(scores_identical) +# Same logic: identical data -> can't split -> maximum path -> low scores +test_res = (mean_score > 0.2) & (mean_score < 0.35) # Expect low scores for identical data +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Mean score: " + toString(mean_score) + " (low score expected - no variation to detect)") +print("") + +# Test 8: Negative values +print("Test 8: All negative values") +testname = "All negative values" +X_negative = rand(rows=100, cols=5, min=-100, max=-1, seed=42) +model_negative = iForest::outlierByIsolationForest(X=X_negative, n_trees=20, subsampling_size=50) +scores_negative = iForest::outlierByIsolationForestApply(iForestModel=model_negative, X=X_negative) +test_res = (nrow(scores_negative) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 9: Very large values +print("Test 9: Very large values (thousands)") +testname = "Very large values" +X_huge = rand(rows=100, cols=5, min=1000, max=10000, seed=42) +model_huge = iForest::outlierByIsolationForest(X=X_huge, n_trees=10, subsampling_size=50) +scores_huge = iForest::outlierByIsolationForestApply(iForestModel=model_huge, X=X_huge) +test_res = (nrow(scores_huge) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) 
+print("") + +# Test 10: Very small values +print("Test 10: Very small values (near zero)") +testname = "Very small values" +X_tiny_vals = rand(rows=100, cols=5, min=0.0001, max=0.001, seed=42) +model_tiny_vals = iForest::outlierByIsolationForest(X=X_tiny_vals, n_trees=10, subsampling_size=50) +scores_tiny_vals = iForest::outlierByIsolationForestApply(iForestModel=model_tiny_vals, X=X_tiny_vals) +test_res = (nrow(scores_tiny_vals) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +# Test 11: Mixed extreme values +print("Test 11: Mixed extreme values") +testname = "Mixed extreme values" +X_mixed = rand(rows=100, cols=5, min=-1000, max=1000, seed=42) +model_mixed = iForest::outlierByIsolationForest(X=X_mixed, n_trees=10, subsampling_size=50) +scores_mixed = iForest::outlierByIsolationForestApply(iForestModel=model_mixed, X=X_mixed) +test_res = (nrow(scores_mixed) == 100) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print("") + +print("===============================================================") +print("CATEGORY 4: Model Validation") +print("===============================================================") +print("") + +# NOTE: Seed reproducibility tests skipped due to integer overflow bug +# in seed generation (isolationForest.dml line 144) +# Bug: seeds = matrix(seq(1, n_trees), cols=n_trees, rows=1)*seed +# causes overflow when n_trees >= 10 and seed > 0 + +# Test 12: Model produces valid output +print("Test 12: Model produces valid anomaly scores") +testname = "Valid model output" +X_test = rand(rows=50, cols=5, seed=789) +# Use small n_trees and no seed to avoid bugs +model_test = iForest::outlierByIsolationForest(X=X_test, n_trees=5, subsampling_size=25) +scores_test = iForest::outlierByIsolationForestApply(iForestModel=model_test, X=X_test) +# Check we got scores for all rows +test_res = (nrow(scores_test) == 50) & (ncol(scores_test) == 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Scores produced: " + toString(nrow(scores_test)) + " rows") +print("") + +# Test 13: Scores are in valid range +print("Test 13: Anomaly scores in valid range [0,1]") +testname = "Valid score range" +X_range = rand(rows=100, cols=5, seed=456) +model_range = iForest::outlierByIsolationForest(X=X_range, n_trees=5, subsampling_size=50) +scores_range = iForest::outlierByIsolationForestApply(iForestModel=model_range, X=X_range) +min_score = min(scores_range) +max_score = max(scores_range) +test_res = (min_score >= 0) & (max_score <= 1) +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Score range: [" + toString(min_score) + ", " + toString(max_score) + "]") +print("") + +print("===============================================================") +print("CATEGORY 5: Outlier Detection Scenarios") +print("===============================================================") +print("") + +# Test 14: Subtle outliers +print("Test 14: Subtle outliers (2 standard deviations)") +testname = "Subtle outliers" +X_normal = rand(rows=99, cols=5, pdf="normal", seed=42) +X_subtle_outlier = matrix(2.0, rows=1, cols=5) +X_with_subtle = rbind(X_normal, X_subtle_outlier) +model_subtle = iForest::outlierByIsolationForest(X=X_with_subtle, n_trees=30, subsampling_size=50) +scores_subtle = iForest::outlierByIsolationForestApply(iForestModel=model_subtle, X=X_with_subtle) +outlier_score = as.scalar(scores_subtle[100,1]) +normal_mean_score = mean(scores_subtle[1:99,]) +test_res = 
outlier_score > normal_mean_score +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Outlier score: " + toString(outlier_score) + ", Normal mean: " + toString(normal_mean_score)) +print("") + +# Test 15: Outlier in only one feature +print("Test 15: Outlier in only one feature") +testname = "One-dimensional outlier" +X_one_dim_outlier = rand(rows=100, cols=5, min=0, max=10, seed=42) +X_one_dim_outlier[1,1] = 100 +model_one_dim = iForest::outlierByIsolationForest(X=X_one_dim_outlier, n_trees=30, subsampling_size=50) +scores_one_dim = iForest::outlierByIsolationForestApply(iForestModel=model_one_dim, X=X_one_dim_outlier) +outlier_score = as.scalar(scores_one_dim[1,1]) +normal_mean = mean(scores_one_dim[2:100,]) +test_res = outlier_score > normal_mean +[test_cnt, fails] = record_test_result(testname, test_res, test_cnt, fails) +print(" Outlier score: " + toString(outlier_score) + ", Normal mean: " + toString(normal_mean)) +print("") + +print("===============================================================") +print("SUMMARY") +print("===============================================================") +succ_test_cnt = test_cnt - length(fails) +print(toString(succ_test_cnt) + "/" + toString(test_cnt) + " tests succeeded!") +if (length(fails) > 0) { + print("\nTests that failed:") + print(toString(fails)) +} + +print("===============================================================") + + print("===============================================================") print("TESTING FINISHED!") \ No newline at end of file diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index 4feab311c76..d3a518f47f0 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -265,6 +265,8 @@ public enum Builtins { OUTLIER_ARIMA("outlierByArima",true), OUTLIER_IQR("outlierByIQR", true), OUTLIER_IQR_APPLY("outlierByIQRApply", true), + OUTLIER_ISOLATION_FOREST("outlierByIsolationForest", true), + OUTLIER_ISOLATION_FOREST_APPLY("outlierByIsolationForestApply", true), OUTLIER_SD("outlierBySd", true), OUTLIER_SD_APPLY("outlierBySdApply", true), PAGERANK("pageRank", true), diff --git a/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForest.rst b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForest.rst new file mode 100644 index 00000000000..4cc27950c73 --- /dev/null +++ b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForest.rst @@ -0,0 +1,25 @@ +.. ------------------------------------------------------------- +.. +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. +.. +.. 
------------------------------------------------------------ + +outlierByIsolationForest +======================== + +.. autofunction:: systemds.operator.algorithm.outlierByIsolationForest \ No newline at end of file diff --git a/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForestApply.rst b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForestApply.rst new file mode 100644 index 00000000000..aff908f4785 --- /dev/null +++ b/src/main/python/docs/source/api/operator/algorithms/outlierByIsolationForestApply.rst @@ -0,0 +1,25 @@ +.. ------------------------------------------------------------- +.. +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at +.. +.. http://www.apache.org/licenses/LICENSE-2.0 +.. +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. +.. +.. ------------------------------------------------------------ + +outlierByIsolationForestApply +============================= + +.. autofunction:: systemds.operator.algorithm.outlierByIsolationForestApply \ No newline at end of file diff --git a/src/main/python/systemds/operator/algorithm/__init__.py b/src/main/python/systemds/operator/algorithm/__init__.py index e8cb4c04e95..60ce92715eb 100644 --- a/src/main/python/systemds/operator/algorithm/__init__.py +++ b/src/main/python/systemds/operator/algorithm/__init__.py @@ -159,6 +159,8 @@ from .builtin.outlierByArima import outlierByArima from .builtin.outlierByIQR import outlierByIQR from .builtin.outlierByIQRApply import outlierByIQRApply +from .builtin.outlierByIsolationForest import outlierByIsolationForest +from .builtin.outlierByIsolationForestApply import outlierByIsolationForestApply from .builtin.outlierBySd import outlierBySd from .builtin.outlierBySdApply import outlierBySdApply from .builtin.pageRank import pageRank @@ -358,6 +360,8 @@ 'outlierByArima', 'outlierByIQR', 'outlierByIQRApply', + 'outlierByIsolationForest', + 'outlierByIsolationForestApply', 'outlierBySd', 'outlierBySdApply', 'pageRank', diff --git a/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForest.py b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForest.py new file mode 100644 index 00000000000..d6adf720ccf --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForest.py @@ -0,0 +1,86 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/outlierByIsolationForest.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def outlierByIsolationForest(X: Matrix, + n_trees: int, + subsampling_size: int, + **kwargs: Dict[str, VALID_INPUT_TYPES]): + """ + Builtin function that implements anomaly detection via isolation forest as described in + [Liu2008]: + Liu, F. T., Ting, K. M., & Zhou, Z. H. + (2008, December). + Isolation forest. + In 2008 eighth ieee international conference on data mining (pp. 413-422). + IEEE. + + This function creates an iForest model for outlier detection. + + .. code-block:: python + + >>> import numpy as np + >>> from systemds.context import SystemDSContext + >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply + >>> with SystemDSContext() as sds: + ... # Create training data: 20 points clustered near origin + ... X_train = sds.from_numpy(np.array([ + ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], + ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], + ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], + ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] + ... ])) + ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) + ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) + ... scores = outlierByIsolationForestApply(model, X_test).compute() + ... print(scores.shape) + ... print(scores[1, 0] > scores[0, 0]) + ... print(scores[1, 0] > 0.5) + (2, 1) + True + True + + + + + :param X: Numerical feature matrix + :param n_trees: Number of iTrees to build + :param subsampling_size: Size of the subsample to build iTrees with + :param seed: Seed for calls to `sample` and `rand`. -1 corresponds to a random seed + :return: The trained iForest model to be used in outlierByIsolationForestApply. + The model is represented as a list with two entries: + Entry 'model' (Matrix[Double]) - The iForest Model in linearized form (see m_iForest) + Entry 'subsampling_size' (Double) - The subsampling size used to build the model. + """ + + params_dict = {'X': X, 'n_trees': n_trees, 'subsampling_size': subsampling_size} + params_dict.update(kwargs) + return Matrix(X.sds_context, + 'outlierByIsolationForest', + named_input_nodes=params_dict) diff --git a/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForestApply.py b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForestApply.py new file mode 100644 index 00000000000..2ecb612a3e5 --- /dev/null +++ b/src/main/python/systemds/operator/algorithm/builtin/outlierByIsolationForestApply.py @@ -0,0 +1,79 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +# Autogenerated By : src/main/python/generator/generator.py +# Autogenerated From : scripts/builtin/outlierByIsolationForestApply.dml + +from typing import Dict, Iterable + +from systemds.operator import OperationNode, Matrix, Frame, List, MultiReturn, Scalar +from systemds.utils.consts import VALID_INPUT_TYPES + + +def outlierByIsolationForestApply(iForestModel: List, + X: Matrix): + """ + Builtin function that calculates the anomaly score as described in [Liu2008] + for a set of samples `X` based on an iForest model. + + [Liu2008]: + Liu, F. T., Ting, K. M., & Zhou, Z. H. + (2008, December). + Isolation forest. + In 2008 eighth ieee international conference on data mining (pp. 413-422). + IEEE. + + .. code-block:: python + + >>> import numpy as np + >>> from systemds.context import SystemDSContext + >>> from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply + >>> with SystemDSContext() as sds: + ... # Create training data: 20 points clustered near origin + ... X_train = sds.from_numpy(np.array([ + ... [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4], + ... [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9], + ... [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4], + ... [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9] + ... ])) + ... model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42) + ... X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]])) + ... scores = outlierByIsolationForestApply(model, X_test).compute() + ... print(scores.shape) + ... print(scores[1, 0] > scores[0, 0]) + ... print(scores[1, 0] > 0.5) + (2, 1) + True + True + + + + + :param iForestModel: The trained iForest model as returned by outlierByIsolationForest + :param X: Samples to calculate the anomaly score for + :return: Column vector of anomaly scores corresponding to the samples in X. + Samples with an anomaly score > 0.5 are generally considered to be outliers + """ + + params_dict = {'iForestModel': iForestModel, 'X': X} + return Matrix(iForestModel.sds_context, + 'outlierByIsolationForestApply', + named_input_nodes=params_dict) diff --git a/src/main/python/tests/auto_tests/test_outlierByIsolationForest.py b/src/main/python/tests/auto_tests/test_outlierByIsolationForest.py new file mode 100644 index 00000000000..41e0f4b5f31 --- /dev/null +++ b/src/main/python/tests/auto_tests/test_outlierByIsolationForest.py @@ -0,0 +1,56 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By : src/main/python/generator/generator.py
+import unittest, contextlib, io
+
+
+class TestOUTLIERBYISOLATIONFOREST(unittest.TestCase):
+    def test_outlierByIsolationForest(self):
+        # Example test case taken from the Python code block in the builtin's documentation
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf):
+            import numpy as np
+            from systemds.context import SystemDSContext
+            from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply
+            with SystemDSContext() as sds:
+                # Create training data: 20 points clustered near origin
+                X_train = sds.from_numpy(np.array([
+                    [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4],
+                    [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9],
+                    [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4],
+                    [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9]
+                ]))
+                model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42)
+                X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]]))
+                scores = outlierByIsolationForestApply(model, X_test).compute()
+                print(scores.shape)
+                print(scores[1, 0] > scores[0, 0])
+                print(scores[1, 0] > 0.5)
+
+        expected = """(2, 1)
+True
+True"""
+        self.assertEqual(buf.getvalue().strip(), expected)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/main/python/tests/auto_tests/test_outlierByIsolationForestApply.py b/src/main/python/tests/auto_tests/test_outlierByIsolationForestApply.py
new file mode 100644
index 00000000000..e0a9abb1a90
--- /dev/null
+++ b/src/main/python/tests/auto_tests/test_outlierByIsolationForestApply.py
@@ -0,0 +1,56 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+# Autogenerated By : src/main/python/generator/generator.py
+import unittest, contextlib, io
+
+
+class TestOUTLIERBYISOLATIONFORESTAPPLY(unittest.TestCase):
+    def test_outlierByIsolationForestApply(self):
+        # Example test case taken from the Python code block in the builtin's documentation
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf):
+            import numpy as np
+            from systemds.context import SystemDSContext
+            from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply
+            with SystemDSContext() as sds:
+                # Create training data: 20 points clustered near origin
+                X_train = sds.from_numpy(np.array([
+                    [0.0, 0.0], [0.1, 0.1], [0.2, 0.2], [0.3, 0.3], [0.4, 0.4],
+                    [0.5, 0.5], [0.6, 0.6], [0.7, 0.7], [0.8, 0.8], [0.9, 0.9],
+                    [1.0, 1.0], [1.1, 1.1], [1.2, 1.2], [1.3, 1.3], [1.4, 1.4],
+                    [1.5, 1.5], [1.6, 1.6], [1.7, 1.7], [1.8, 1.8], [1.9, 1.9]
+                ]))
+                model = outlierByIsolationForest(X_train, n_trees=100, subsampling_size=10, seed=42)
+                X_test = sds.from_numpy(np.array([[1.0, 1.0], [100.0, 100.0]]))
+                scores = outlierByIsolationForestApply(model, X_test).compute()
+                print(scores.shape)
+                print(scores[1, 0] > scores[0, 0])
+                print(scores[1, 0] > 0.5)
+
+        expected = """(2, 1)
+True
+True"""
+        self.assertEqual(buf.getvalue().strip(), expected)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIsolationForestTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIsolationForestTest.java
new file mode 100644
index 00000000000..2d556556e4e
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinIsolationForestTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
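Both generated tests above compare captured stdout against a hard-coded string. If these checks ever need to report the offending values, the same three properties can be asserted directly on the computed NumPy result. The sketch below is only an alternative formulation (the stand-in score matrix is made up), not part of the generated test suite.

.. code-block:: python

    import numpy as np

    def check_scores(scores):
        # The three properties the generated tests print and string-compare,
        # expressed as direct assertions on the score matrix.
        assert scores.shape == (2, 1)
        assert scores[1, 0] > scores[0, 0]  # far-away test point scores higher
        assert scores[1, 0] > 0.5           # and crosses the usual outlier threshold

    # In the tests above, `scores` comes from
    # outlierByIsolationForestApply(model, X_test).compute(); a stand-in is used here.
    check_scores(np.array([[0.42], [0.87]]))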
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+public class BuiltinIsolationForestTest extends AutomatedTestBase {
+	private final static String TEST_NAME = "outlierByIsolationForestTest";
+	private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinIsolationForestTest.class.getSimpleName() + "/";
+
+	private final static double eps = 1e-10;
+	private final static int rows = 100;
+	private final static int cols = 3;
+	private final static int n_trees = 10;
+	private final static int subsampling_size = 20;
+	private final static int seed = 42;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,
+			new String[]{"scores", "model", "subsampling_size"}));
+	}
+
+	@Test
+	public void testIsolationForestSingleNode() {
+		runIsolationForestTest(false, ExecMode.SINGLE_NODE);
+	}
+
+	@Test
+	public void testIsolationForestHybrid() {
+		runIsolationForestTest(false, ExecMode.HYBRID);
+	}
+
+	@Test
+	public void testIsolationForestWithOutliersSingleNode() {
+		runIsolationForestTest(true, ExecMode.SINGLE_NODE);
+	}
+
+	@Test
+	public void testIsolationForestWithOutliersHybrid() {
+		runIsolationForestTest(true, ExecMode.HYBRID);
+	}
+
+	private void runIsolationForestTest(boolean withOutliers, ExecMode mode) {
+		ExecMode platformOld = setExecMode(mode);
+
+		try {
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			String HOME = SCRIPT_DIR + TEST_DIR;
+
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{"-nvargs",
+				"X=" + input("A"),
+				"n_trees=" + n_trees,
+				"subsampling_size=" + subsampling_size,
+				"seed=" + seed,
+				"output=" + output("scores"),
+				"model_output=" + output("model"),
+				"subsampling_size_output=" + output("subsampling_size")};
+
+			// Generate data
+			double[][] A;
+			if (withOutliers) {
+				// Generate data with clear outliers:
+				// most data is around 0, outliers are far away
+				A = new double[rows][cols];
+				for (int i = 0; i < rows - 5; i++) {
+					for (int j = 0; j < cols; j++) {
+						// Normal data: mean=0, range=[-2, 2]
+						A[i][j] = (Math.random() - 0.5) * 4;
+					}
+				}
+				// Add outliers: far from normal data
+				for (int i = rows - 5; i < rows; i++) {
+					for (int j = 0; j < cols; j++) {
+						// Outliers: mean=10, range=[8, 12]
+						A[i][j] = 8 + Math.random() * 4;
+					}
+				}
+			} else {
+				// Generate normal data (no outliers)
+				A = getRandomMatrix(rows, cols, -5, 5, 0.7, seed);
+			}
+
+			writeInputMatrixWithMTD("A", A, true);
+
+			runTest(true, false, null, -1);
+
+			// Verify model was created
+			HashMap<CellIndex, Double> model = readDMLMatrixFromOutputDir("model");
+			Assert.assertNotNull("Model should not be null", model);
+			Assert.assertFalse("Model should have entries", model.isEmpty());
+
+			// Verify subsampling size was stored correctly
+			HashMap<CellIndex, Double> subsamplingSize = readDMLScalarFromOutputDir("subsampling_size");
+			Assert.assertEquals("Subsampling size should match",
+				subsampling_size,
+				subsamplingSize.get(new CellIndex(1, 1)),
+				eps);
+
+			// Verify model has n_trees rows
+			int maxRow = 0;
+			for (CellIndex idx : model.keySet()) {
+				maxRow = Math.max(maxRow, idx.row);
+			}
+ Assert.assertEquals("Model should have n_trees rows", n_trees, maxRow); + } finally { + rtplatform = platformOld; + } + } +} \ No newline at end of file diff --git a/src/test/scripts/functions/builtin/outlierByIsolationForestTest.dml b/src/test/scripts/functions/builtin/outlierByIsolationForestTest.dml new file mode 100644 index 00000000000..bca50578b97 --- /dev/null +++ b/src/test/scripts/functions/builtin/outlierByIsolationForestTest.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($X) + +model = outlierByIsolationForest(X=X, n_trees=$n_trees, subsampling_size=$subsampling_size, seed=$seed) +M = as.matrix(model['model']) +subsampling_size_out = as.scalar(model['subsampling_size']) + +scores = outlierByIsolationForestApply(iForestModel=model, X=X) + +write(scores, $output) +write(M, $model_output) +write(subsampling_size_out, $subsampling_size_output)
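The Java test generates 95 normal rows in [-2, 2] and 5 outlier rows in [8, 12] but only validates the model's shape and the stored subsampling size. A possible follow-up check through the Python API is sketched below; it mirrors the Java test's data layout, and the mean-score comparison at the end is an extra illustrative check, not an assertion made by the committed tests.

.. code-block:: python

    import numpy as np
    from systemds.context import SystemDSContext
    from systemds.operator.algorithm import outlierByIsolationForest, outlierByIsolationForestApply

    # Mirror BuiltinIsolationForestTest: 95 normal rows in [-2, 2] and
    # 5 outlier rows in [8, 12], with 3 features each.
    rng = np.random.default_rng(42)
    A = np.vstack([rng.uniform(-2.0, 2.0, size=(95, 3)),
                   rng.uniform(8.0, 12.0, size=(5, 3))])

    with SystemDSContext() as sds:
        X = sds.from_numpy(A)
        model = outlierByIsolationForest(X, n_trees=10, subsampling_size=20, seed=42)
        scores = outlierByIsolationForestApply(model, X).compute()

    # The injected outlier rows should score noticeably higher on average.
    print(scores[:95].mean(), scores[95:].mean())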