`%>%` <- magrittr::`%>%` #' @useDynLib strats strats_compute_laplacian_matrix_product strats_laplacian_matrix_product <- function (temporal_dependency, X) { out <- matrix (0, nrow (X), ncol (X)) n <- as.integer (nrow (X)) d <- as.integer (ncol (X)) temporal_dependency <- as.numeric (temporal_dependency) omega <- as.integer (length (temporal_dependency)) x <- as.numeric (X) .C (strats_compute_laplacian_matrix_product, n, d, omega, temporal_dependency, x, out)[[6]] } ## Check the matrix product check_matrix_product <- function () { X <- matrix (rnorm (24), 6, 4) temporal_dependency <- c (0.5, 0.2, 0.1) M <- matrix (0, 6, 6) for (omega in 0:3) { mij <- 1 if (omega > 0) { mij <- temporal_dependency[omega] } for (i in 1:6) { j_plus <- i + omega j_minus <- i - omega if (j_plus <= 6) { M[i, j_plus] <- mij } if (j_minus >= 1) { M[i, j_minus] <- mij } } } D <- diag (rowSums (M), 6, 6) L <- D - M Y <- L %*% X Yhat <- strats_laplacian_matrix_product (temporal_dependency, X) stopifnot (nrow (Yhat) == nrow (Y)) stopifnot (ncol (Yhat) == ncol (Y)) stopifnot (max (abs (Yhat - Y)) < 1e-8) } #' Compute the STRATS loss #' #' @param T_normal a normal subset of the time series #' @param T the time series #' @param alpha the global detection regularizer #' @param beta the residual analysis regularizer #' @param gamma the contextual detection regularizer #' @param dependency a vector giving the temporal dependency between #' an observation and the next observations (the temporal #' dependency is symmetric) #' @param W the reconstruction matrix #' @param R the residual matrix #' @return the loss #' @export strats_loss <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) { non_residual <- R - T (sum ((T - crossprod (W, T_normal) - R) ^ 2) + alpha * sum (sqrt (rowSums (W ^ 2))) + beta * sum (sqrt (rowSums (R ^ 2))) + gamma * sum (non_residual * strats_laplacian_matrix_product (dependency, non_residual))) } #' Compute the STRATS gradient wrt W #' #' @param T_normal a normal subset of the time series #' @param T the time series #' @param alpha the global detection regularizer #' @param beta the residual analysis regularizer #' @param gamma the contextual detection regularizer #' @param dependency a vector giving the temporal dependency between #' an observation and the next observations (the temporal #' dependency is symmetric) #' @param W the reconstruction matrix #' @param R the residual matrix #' @return the gradient wrt W #' @export strats_gradient_w <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) { Dw <- 1 / (2 * sqrt (rowSums (W ^ 2))) Dw[!is.finite (Dw)] <- 0 M <- tcrossprod (T_normal) diag (M) <- diag (M) + alpha * Dw 2 * (M %*% W - tcrossprod (T_normal, T) + tcrossprod (T_normal, R)) } #' Compute the STRATS gradient wrt R #' #' @param T_normal a normal subset of the time series #' @param T the time series #' @param alpha the global detection regularizer #' @param beta the residual analysis regularizer #' @param gamma the contextual detection regularizer #' @param dependency a vector giving the temporal dependency between #' an observation and the next observations (the temporal #' dependency is symmetric) #' @param W the reconstruction matrix #' @param R the residual matrix #' @return the gradient wrt R #' @export strats_gradient_r <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) { Dr <- 1 / (2 * sqrt (rowSums (R ^ 2))) Dr[!is.finite (Dr)] <- 0 2 * (crossprod (W, T_normal) - T + R + beta * sweep (R, 1, Dr, `*`) + gamma * strats_laplacian_matrix_product (dependency, R - T)) } gradient_error <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) { gW <- matrix (0, nrow (T_normal), nrow (T)) gR <- matrix (0, nrow (T), ncol (T)) for (i in 1:nrow (T_normal)) { for (j in 1:nrow (T)) { W_minus <- W W_plus <- W W_minus[i, j] <- W_minus[i, j] - 1e-4 W_plus[i, j] <- W_plus[i, j] + 1e-4 l_minus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W_minus, R) l_plus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W_plus, R) gW[i, j] <- (l_plus - l_minus) / (2 * 1e-4) } } for (i in 1:nrow (T)) { for (j in 1:ncol (T)) { R_minus <- R R_plus <- R R_minus[i, j] <- R_minus[i, j] - 1e-4 R_plus[i, j] <- R_plus[i, j] + 1e-4 l_minus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R_minus) l_plus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R_plus) gR[i, j] <- (l_plus - l_minus) / (2 * 1e-4) } } gW_true <- strats_gradient_w (T_normal, T, alpha, beta, gamma, dependency, W, R) gR_true <- strats_gradient_r (T_normal, T, alpha, beta, gamma, dependency, W, R) max (max (abs (gW - gW_true)), max (abs (gR - gR_true))) } check_gradient <- function () { set.seed (1) for (i in 1:100) { n <- 100 n_train <- 25 d <- 10 dependency <- c (0.5, 0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001) T_normal <- matrix (rnorm (n_train * d), n_train, d) T <- matrix (rnorm (n * d), n, d) W <- matrix (rnorm (n_train * n), n_train, n) R <- matrix (rnorm (n * d), n, d) stopifnot (gradient_error (T_normal, T, 1000, 1000, 1000, dependency, W, R) <= 1e-4) } } #' Check that the step size is adapted. #' #' @param T_normal a normal subset of the time series #' @param T the time series #' @param alpha the global detection regularizer #' @param beta the residual analysis regularizer #' @param gamma the contextual detection regularizer #' @param dependency a vector giving the temporal dependency between #' an observation and the next observations (the temporal #' dependency is symmetric) #' @param W the reconstruction matrix #' @param R the residual matrix #' @param gW the gradient wrt W at W #' @param gR the gradient wrt R at R #' @param step_w the step size for W (may be 0 to only update R) #' @param step_r the step size for R (may be 0 to only update W) #' @param loss_value the loss value at (W, R) #' @return whether the step sizes for W and R are correct #' @export armijo <- function (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step_w, step_r, loss_value) { nxt_w <- W - step_w * gW nxt_r <- R - step_r * gR loss_next <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, nxt_w, nxt_r) loss_limit <- (loss_value + sum ((nxt_w - W) * gW) + sum ((nxt_r - R) * gR) + ifelse (step_w == 0, 0, 0.5 * sum ((nxt_w - W) ^ 2) / step_w) + ifelse (step_r == 0, 0, 0.5 * sum ((nxt_r - R) ^ 2) / step_r)) (loss_next <= loss_limit) } line_search_w <- function (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step, loss_value) { if (armijo (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step, 0, loss_value)) { step } else { cat (sprintf ("%g is not sufficient for step_w.\n", step), file = stderr ()) line_search_w (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step / 2, loss_value) } } line_search_r <- function (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step, loss_value) { if (armijo (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, 0, step, loss_value)) { step } else { cat (sprintf ("%g is not sufficient for step_r.\n", step), file = stderr ()) line_search_r (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step / 2, loss_value) } } strats_iteration <- function (T_normal, T, alpha, beta, gamma, dependency, loss_value = NULL, W = NULL, R = NULL, step_w = NULL, step_r = NULL) { if (is.null (W)) { W <- diag (1, nrow (T_normal), nrow (T)) row.names (W) <- row.names (T_normal) colnames (W) <- row.names (T) } if (is.null (R)) { R <- matrix (0, nrow (T), ncol (T)) row.names (R) <- row.names (T) colnames (R) <- colnames (T) } if (is.null (loss_value)) { loss_value <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R) } if (is.null (step_w)) { step_w <- 1 } if (is.null (step_r)) { step_r <- 1 } ## Optimize W: gW <- strats_gradient_w (T_normal, T, alpha, beta, gamma, dependency, W, R) gR <- matrix (0, nrow (T), ncol (T)) step_w <- line_search_w (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step_w, loss_value) cat (sprintf ("Using step W: %g\n", step_w), file = stderr ()) W <- W - step_w * gW loss_value <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R) ## Optimize R: gW <- matrix (0, nrow (T_normal), nrow (T)) gR <- strats_gradient_r (T_normal, T, alpha, beta, gamma, dependency, W, R) step_r <- line_search_r (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step_r, loss_value) cat (sprintf ("Using step R: %g\n", step_r), file = stderr ()) R <- R - step_r * gR loss_value <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R) list (loss_value = loss_value, W = W, R = R, step_w = step_w, step_r = step_r) } strats_sample <- function (T, n_points) { i <- ceiling (nrow (T) * ((1:n_points) / n_points)) T[i,, drop = FALSE] } #' Apply the STRATS algorithm. #' #' The algorithm uses a normal time series, it is either passed as #' T_normal, or sampled from T with n_normal points. #' #' @param T the time series #' @param alpha the global detection regularizer #' @param beta the residual analysis regularizer #' @param gamma the contextual detection regularizer #' @param dependency a vector giving the temporal dependency between #' an observation and the next observations (the temporal #' dependency is symmetric) #' @param T_normal a time series with normal data #' @param n_normal the number of normal points to sample #' @param maxiter how many gradient descent iterations to make #' @return the STRATS model, as a list with key 'anomalies' bound to a #' tibble with 'observation' (the index in T), 'score' (the #' anomaly score) and 'rank' (the anomaly rank). #' @export strats <- function (T, alpha, beta, gamma, dependency, T_normal = NULL, n_normal = NULL, maxiter = 100) { if (is.null (T_normal) && is.null (n_normal)) { stop ("Either pas n_normal, the number of samples to take for normal, or pass T_normal") } if (is.null (T_normal)) { T_normal <- strats_sample (T, n_normal) } T <- as.matrix (T) T_normal <- as.matrix (T_normal) dependency <- c (dependency) stopifnot (is.finite (alpha)) stopifnot (is.finite (beta)) stopifnot (is.finite (gamma)) stopifnot (alpha >= 0) stopifnot (beta >= 0) stopifnot (gamma >= 0) stopifnot (all (is.finite (dependency))) stopifnot (all (dependency >= 0)) stopifnot (all (is.finite (T))) stopifnot (all (is.finite (T_normal))) stopifnot (ncol (T) == ncol (T_normal)) model <- list () for (i in 1:maxiter) { last_loss <- model$loss_value model <- strats_iteration (T_normal, T, alpha, beta, gamma, dependency, model$loss_value, model$W, model$R, model$step_w, model$step_r) cat (sprintf ("New model with loss value: %g.\n", model$loss_value), file = stderr ()) if (!is.null (last_loss)) { stopifnot (model$loss_value <= last_loss) } } model$anomalies <- (tibble::tibble (observation = seq_len (nrow (model$R)), score = rowSums (model$R ^ 2)) %>% dplyr::arrange (-score) %>% dplyr::mutate (rank = seq_len (nrow (model$R)))) model } #' Test a configuration for STRATS with synthetic anomaly injection #' #' @param T the time series #' @param alpha the global detection regularizer #' @param beta the residual analysis regularizer #' @param gamma the contextual detection regularizer #' @param dependency a vector giving the temporal dependency between #' an observation and the next observations (the temporal #' dependency is symmetric) #' @param T_normal a time series with normal data #' @param n_normal the number of normal points to sample #' @param maxiter how many gradient descent iterations to make #' @return the mean rank of the injected anomalies. Lower is better. #' @export strats_test <- function (T, alpha, beta, gamma, dependency, T_normal = NULL, n_normal = NULL, maxiter = 100) { ## Inject 1 anomaly, learn STRATS, and check the rank of the ## anomaly. Repeat 10 times with different anomalies. Return the ## mean. do.call (mean, lapply (1:10, function (seed) { set.seed (seed) i_base <- sample (seq_len (nrow (T)), 1) i_anomaly <- sample (seq_len (nrow (T)), 1) which_swapped <- sample (seq_len (ncol (T)), ceiling (ncol (T) / 2)) t <- T[i_anomaly,] t_base <- T[i_base,] t[which_swapped] <- t_base[which_swapped] T[i_anomaly,] <- t model <- strats (T, alpha, beta, gamma, dependency, T_normal = T_normal, n_normal = n_normal, maxiter = maxiter) cat (sprintf ("WTF?? %s\n", paste (colnames (model$anomalies), collapse = "\n")), file = stderr ()) (model$anomalies %>% dplyr::filter (observation == i_anomaly))$rank[1] })) } #' Tune STRATS with synthetic anomalies #' #' @param T the time series #' @param dependency a vector giving the temporal dependency between #' an observation and the next observations (the temporal #' dependency is symmetric) #' @param T_normal a time series with normal data #' @param n_normal the number of normal points to sample #' @param maxiter how many gradient descent iterations to make #' @return a tibble, with columns alpha, beta, gamma, and error (the error #' on synthetic anomalies). #' @export strats_tune <- function (T, dependency, T_normal = NULL, n_normal = NULL, maxiter = 100) { set.seed (1) do.call (rbind, lapply (1:20, function (seed) { set.seed (seed) alpha <- signif (10^runif (1, -3, 3), 1) beta <- signif (10^runif (1, -3, 3), 1) gamma <- signif (10^runif (1, -3, 3), 1) error <- strats_test (T, alpha, beta, gamma, dependency, T_normal = T_normal, n_normal = n_normal, maxiter = maxiter) cat (sprintf ("Alpha: %g, beta: %g, gamma: %g, error: %g\n", alpha, beta, gamma, error), file = stderr ()) tibble::tibble (alpha = alpha, beta = beta, gamma = gamma, error = error) })) }