`%>%` <- magrittr::`%>%`

#' @useDynLib strats strats_compute_laplacian_matrix_product
strats_laplacian_matrix_product <- function (temporal_dependency, X) {
    out <- matrix (0, nrow (X), ncol (X))
    n <- as.integer (nrow (X))
    d <- as.integer (ncol (X))
    temporal_dependency <- as.numeric (temporal_dependency)
    omega <- as.integer (length (temporal_dependency))
    x <- as.numeric (X)
    .C (strats_compute_laplacian_matrix_product,
        n, d, omega, temporal_dependency, x, out)[[6]]
}

## Check the matrix product
check_matrix_product <- function () {
    X <- matrix (rnorm (24), 6, 4)
    temporal_dependency <- c (0.5, 0.2, 0.1)
    M <- matrix (0, 6, 6)
    for (omega in 0:3) {
        mij <- 1
        if (omega > 0) {
            mij <- temporal_dependency[omega]
        }
        for (i in 1:6) {
            j_plus <- i + omega
            j_minus <- i - omega
            if (j_plus <= 6) {
                M[i, j_plus] <- mij
            }
            if (j_minus >= 1) {
                M[i, j_minus] <- mij
            }
        }
    }
    D <- diag (rowSums (M), 6, 6)
    L <- D - M
    Y <- L %*% X
    Yhat <- strats_laplacian_matrix_product (temporal_dependency, X)
    stopifnot (nrow (Yhat) == nrow (Y))
    stopifnot (ncol (Yhat) == ncol (Y))
    stopifnot (max (abs (Yhat - Y)) < 1e-8)
}

#' Compute the STRATS loss
#'
#' @param T_normal a normal subset of the time series
#' @param T the time series
#' @param alpha the global detection regularizer
#' @param beta the residual analysis regularizer
#' @param gamma the contextual detection regularizer
#' @param dependency a vector giving the temporal dependency between
#'     an observation and the next observations (the temporal
#'     dependency is symmetric)
#' @param W the reconstruction matrix
#' @param R the residual matrix
#' @return the loss
#' @export
strats_loss <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) {
    non_residual <- R - T
    (sum ((T - crossprod (W, T_normal) - R) ^ 2)
        + alpha * sum (sqrt (rowSums (W ^ 2)))
        + beta * sum (sqrt (rowSums (R ^ 2)))
        + gamma * sum (non_residual * strats_laplacian_matrix_product (dependency, non_residual)))
}

#' Compute the STRATS gradient wrt W
#'
#' @param T_normal a normal subset of the time series
#' @param T the time series
#' @param alpha the global detection regularizer
#' @param beta the residual analysis regularizer
#' @param gamma the contextual detection regularizer
#' @param dependency a vector giving the temporal dependency between
#'     an observation and the next observations (the temporal
#'     dependency is symmetric)
#' @param W the reconstruction matrix
#' @param R the residual matrix
#' @return the gradient wrt W
#' @export
strats_gradient_w <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) {
    Dw <- 1 / (2 * sqrt (rowSums (W ^ 2)))
    Dw[!is.finite (Dw)] <- 0
    M <- tcrossprod (T_normal)
    diag (M) <- diag (M) + alpha * Dw
    2 * (M %*% W - tcrossprod (T_normal, T) + tcrossprod (T_normal, R))
}

#' Compute the STRATS gradient wrt R
#'
#' @param T_normal a normal subset of the time series
#' @param T the time series
#' @param alpha the global detection regularizer
#' @param beta the residual analysis regularizer
#' @param gamma the contextual detection regularizer
#' @param dependency a vector giving the temporal dependency between
#'     an observation and the next observations (the temporal
#'     dependency is symmetric)
#' @param W the reconstruction matrix
#' @param R the residual matrix
#' @return the gradient wrt R
#' @export
strats_gradient_r <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) {
    Dr <- 1 / (2 * sqrt (rowSums (R ^ 2)))
    Dr[!is.finite (Dr)] <- 0
    2 * (crossprod (W, T_normal) - T + R + beta * sweep (R, 1, Dr, `*`) + gamma * strats_laplacian_matrix_product (dependency, R - T))
}

gradient_error <- function (T_normal, T, alpha, beta, gamma, dependency, W, R) {
    gW <- matrix (0, nrow (T_normal), nrow (T))
    gR <- matrix (0, nrow (T), ncol (T))
    for (i in 1:nrow (T_normal)) {
        for (j in 1:nrow (T)) {
            W_minus <- W
            W_plus <- W
            W_minus[i, j] <- W_minus[i, j] - 1e-4
            W_plus[i, j] <- W_plus[i, j] + 1e-4
            l_minus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency,
                                    W_minus, R)
            l_plus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency,
                                   W_plus, R)
            gW[i, j] <- (l_plus - l_minus) / (2 * 1e-4)
        }
    }
    for (i in 1:nrow (T)) {
        for (j in 1:ncol (T)) {
            R_minus <- R
            R_plus <- R
            R_minus[i, j] <- R_minus[i, j] - 1e-4
            R_plus[i, j] <- R_plus[i, j] + 1e-4
            l_minus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency,
                                    W, R_minus)
            l_plus <- strats_loss (T_normal, T, alpha, beta, gamma, dependency,
                                   W, R_plus)
            gR[i, j] <- (l_plus - l_minus) / (2 * 1e-4)
        }
    }
    gW_true <- strats_gradient_w (T_normal, T, alpha, beta, gamma, dependency, W, R)
    gR_true <- strats_gradient_r (T_normal, T, alpha, beta, gamma, dependency, W, R)
    max (max (abs (gW - gW_true)), max (abs (gR - gR_true)))
}

check_gradient <- function () {
    set.seed (1)
    for (i in 1:100) {
        n <- 100
        n_train <- 25
        d <- 10
        dependency <- c (0.5, 0.2, 0.1, 0.05, 0.02, 0.01, 0.005, 0.002, 0.001)
        T_normal <- matrix (rnorm (n_train * d), n_train, d)
        T <- matrix (rnorm (n * d), n, d)
        W <- matrix (rnorm (n_train * n), n_train, n)
        R <- matrix (rnorm (n * d), n, d)
        stopifnot (gradient_error (T_normal, T, 1000, 1000, 1000, dependency, W, R)
                   <= 1e-4)
    }
}

#' Check that the step size is adapted.
#'
#' @param T_normal a normal subset of the time series
#' @param T the time series
#' @param alpha the global detection regularizer
#' @param beta the residual analysis regularizer
#' @param gamma the contextual detection regularizer
#' @param dependency a vector giving the temporal dependency between
#'     an observation and the next observations (the temporal
#'     dependency is symmetric)
#' @param W the reconstruction matrix
#' @param R the residual matrix
#' @param gW the gradient wrt W at W
#' @param gR the gradient wrt R at R
#' @param step_w the step size for W (may be 0 to only update R)
#' @param step_r the step size for R (may be 0 to only update W)
#' @param loss_value the loss value at (W, R)
#' @return whether the step sizes for W and R are correct
#' @export
armijo <- function (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step_w, step_r, loss_value) {
    nxt_w <- W - step_w * gW
    nxt_r <- R - step_r * gR
    loss_next <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, nxt_w, nxt_r)
    loss_limit <- (loss_value
        + sum ((nxt_w - W) * gW)
        + sum ((nxt_r - R) * gR)
        + ifelse (step_w == 0, 0, 0.5 * sum ((nxt_w - W) ^ 2) / step_w)
        + ifelse (step_r == 0, 0, 0.5 * sum ((nxt_r - R) ^ 2) / step_r))
    (loss_next <= loss_limit)
}

line_search_w <- function (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step, loss_value) {
    if (armijo (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step, 0, loss_value)) {
        step
    } else {
        cat (sprintf ("%g is not sufficient for step_w.\n", step), file = stderr ())
        line_search_w (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step / 2, loss_value)
    }
}

line_search_r <- function (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step, loss_value) {
    if (armijo (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, 0, step, loss_value)) {
        step
    } else {
        cat (sprintf ("%g is not sufficient for step_r.\n", step), file = stderr ())
        line_search_r (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step / 2, loss_value)
    }
}

strats_iteration <- function (T_normal, T, alpha, beta, gamma, dependency,
                              loss_value = NULL,
                              W = NULL, R = NULL, step_w = NULL, step_r = NULL) {
    if (is.null (W)) {
        W <- diag (1, nrow (T_normal), nrow (T))
        row.names (W) <- row.names (T_normal)
        colnames (W) <- row.names (T)
    }
    if (is.null (R)) {
        R <- matrix (0, nrow (T), ncol (T))
        row.names (R) <- row.names (T)
        colnames (R) <- colnames (T)
    }
    if (is.null (loss_value)) {
        loss_value <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R)
    }
    if (is.null (step_w)) {
        step_w <- 1
    }
    if (is.null (step_r)) {
        step_r <- 1
    }
    ## Optimize W:
    gW <- strats_gradient_w (T_normal, T, alpha, beta, gamma, dependency, W, R)
    gR <- matrix (0, nrow (T), ncol (T))
    step_w <- line_search_w (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step_w, loss_value)
    cat (sprintf ("Using step W: %g\n", step_w), file = stderr ())
    W <- W - step_w * gW
    loss_value <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R)
    ## Optimize R:
    gW <- matrix (0, nrow (T_normal), nrow (T))
    gR <- strats_gradient_r (T_normal, T, alpha, beta, gamma, dependency, W, R)
    step_r <- line_search_r (T_normal, T, alpha, beta, gamma, dependency, W, R, gW, gR, step_r, loss_value)
    cat (sprintf ("Using step R: %g\n", step_r), file = stderr ())
    R <- R - step_r * gR
    loss_value <- strats_loss (T_normal, T, alpha, beta, gamma, dependency, W, R)
    list (loss_value = loss_value,
          W = W, R = R, step_w = step_w, step_r = step_r)
}

strats_sample <- function (T, n_points) {
    i <- ceiling (nrow (T) * ((1:n_points) / n_points))
    T[i,, drop = FALSE]
}

#' Apply the STRATS algorithm.
#'
#' The algorithm uses a normal time series, it is either passed as
#' T_normal, or sampled from T with n_normal points.
#'
#' @param T the time series
#' @param alpha the global detection regularizer
#' @param beta the residual analysis regularizer
#' @param gamma the contextual detection regularizer
#' @param dependency a vector giving the temporal dependency between
#'     an observation and the next observations (the temporal
#'     dependency is symmetric)
#' @param T_normal a time series with normal data
#' @param n_normal the number of normal points to sample
#' @param maxiter how many gradient descent iterations to make
#' @return the STRATS model, as a list with key 'anomalies' bound to a
#'     tibble with 'observation' (the index in T), 'score' (the
#'     anomaly score) and 'rank' (the anomaly rank).
#' @export
strats <- function (T, alpha, beta, gamma, dependency,
                    T_normal = NULL,
                    n_normal = NULL,
                    maxiter = 100) {
    if (is.null (T_normal)
        && is.null (n_normal)) {
        stop ("Either pas n_normal, the number of samples to take for normal, or pass T_normal")
    }
    if (is.null (T_normal)) {
        T_normal <- strats_sample (T, n_normal)
    }
    T <- as.matrix (T)
    T_normal <- as.matrix (T_normal)
    dependency <- c (dependency)
    stopifnot (is.finite (alpha))
    stopifnot (is.finite (beta))
    stopifnot (is.finite (gamma))
    stopifnot (alpha >= 0)
    stopifnot (beta >= 0)
    stopifnot (gamma >= 0)
    stopifnot (all (is.finite (dependency)))
    stopifnot (all (dependency >= 0))
    stopifnot (all (is.finite (T)))
    stopifnot (all (is.finite (T_normal)))
    stopifnot (ncol (T) == ncol (T_normal))
    model <- list ()
    for (i in 1:maxiter) {
        last_loss <- model$loss_value
        model <- strats_iteration (T_normal, T, alpha, beta, gamma, dependency,
                                   model$loss_value,
                                   model$W, model$R, model$step_w, model$step_r)
        cat (sprintf ("New model with loss value: %g.\n", model$loss_value),
             file = stderr ())
        if (!is.null (last_loss)) {
            stopifnot (model$loss_value <= last_loss)
        }
    }
    model$anomalies <- (tibble::tibble (observation = seq_len (nrow (model$R)),
                                        score = rowSums (model$R ^ 2))
        %>% dplyr::arrange (-score)
        %>% dplyr::mutate (rank = seq_len (nrow (model$R))))
    model
}

#' Test a configuration for STRATS with synthetic anomaly injection
#'
#' @param T the time series
#' @param alpha the global detection regularizer
#' @param beta the residual analysis regularizer
#' @param gamma the contextual detection regularizer
#' @param dependency a vector giving the temporal dependency between
#'     an observation and the next observations (the temporal
#'     dependency is symmetric)
#' @param T_normal a time series with normal data
#' @param n_normal the number of normal points to sample
#' @param maxiter how many gradient descent iterations to make
#' @return the mean rank of the injected anomalies. Lower is better.
#' @export
strats_test <- function (T, alpha, beta, gamma, dependency,
                         T_normal = NULL,
                         n_normal = NULL,
                         maxiter = 100) {
    ## Inject 1 anomaly, learn STRATS, and check the rank of the
    ## anomaly. Repeat 10 times with different anomalies. Return the
    ## mean.
    do.call (mean, lapply (1:10, function (seed) {
        set.seed (seed)
        i_base <- sample (seq_len (nrow (T)), 1)
        i_anomaly <- sample (seq_len (nrow (T)), 1)
        which_swapped <- sample (seq_len (ncol (T)), ceiling (ncol (T) / 2))
        t <- T[i_anomaly,]
        t_base <- T[i_base,]
        t[which_swapped] <- t_base[which_swapped]
        T[i_anomaly,] <- t
        model <- strats (T, alpha, beta, gamma, dependency,
                         T_normal = T_normal, n_normal = n_normal,
                         maxiter = maxiter)
        cat (sprintf ("WTF?? %s\n",
                      paste (colnames (model$anomalies), collapse = "\n")),
             file = stderr ())
        (model$anomalies
            %>% dplyr::filter (observation == i_anomaly))$rank[1]
    }))
}

#' Tune STRATS with synthetic anomalies
#'
#' @param T the time series
#' @param dependency a vector giving the temporal dependency between
#'     an observation and the next observations (the temporal
#'     dependency is symmetric)
#' @param T_normal a time series with normal data
#' @param n_normal the number of normal points to sample
#' @param maxiter how many gradient descent iterations to make
#' @return a tibble, with columns alpha, beta, gamma, and error (the error
#'     on synthetic anomalies).
#' @export
strats_tune <- function (T, dependency,
                         T_normal = NULL,
                         n_normal = NULL,
                         maxiter = 100) {
    set.seed (1)
    do.call (rbind, lapply (1:20, function (seed) {
        set.seed (seed)
        alpha <- signif (10^runif (1, -3, 3), 1)
        beta <- signif (10^runif (1, -3, 3), 1)
        gamma <- signif (10^runif (1, -3, 3), 1)
        error <- strats_test (T, alpha, beta, gamma, dependency,
                              T_normal = T_normal,
                              n_normal = n_normal,
                              maxiter = maxiter)
        cat (sprintf ("Alpha: %g, beta: %g, gamma: %g, error: %g\n",
                      alpha, beta, gamma, error),
             file = stderr ())
        tibble::tibble (alpha = alpha, beta = beta, gamma = gamma,
                        error = error)
    }))
}