AIC/BIC vs Cross-Validation

show with app

# calculate AIC + BIC
calc_aic_bic <- function(max.poly, data) {
  # df to store AIC + BIC values
  df <- data.frame(measure=c(rep("AIC",max.poly),rep("BIC",max.poly)),
                   value=numeric(max.poly * 2),
                   degree=rep(1:max.poly, 2))
  
  # AIC + BIC over the max.poly models
  for(i in 1:max.poly){
    df[i,2] <- AIC(lm(y~poly(x,i), data))
    df[i+max.poly,2] <- BIC(lm(y~poly(x,i), data))
  }
  return(df)
}

# plot AIC + BIC
plot_aic_bic <- function(data) {
  p <- ggplot(data=data, aes(x=degree, y=value, group=measure, colour=measure))
  p <- p + geom_line(size=.8)
  p <- p + geom_point(size=3)
  p <- p + ggtitle(label="AIC and BIC")
  p <- p + xlab("Number of polynomials in the model") + labs(colour="Criterion")
  p <- p + theme_bw()
  p <- p + theme(panel.background=element_rect(fill="#F0F0F0"), # Set the entire chart region to a light gray color
                 plot.background=element_rect(fill="#F0F0F0"),
                 panel.grid.major=element_line(colour="#D0D0D0",size=.5), # Format the grid
                 panel.border=element_rect(colour="#F0F0F0"),
                 #panel.border = element_rect(size=1.1, colour="#535353", fill=NA),
                 axis.ticks=element_blank(),
                 axis.text.x = element_text(), 
                 axis.text.y = element_blank(), # Drop y axis text
                 axis.title.x = element_text(size = 15,colour = "#535353"), # Color axis titles
                 axis.title.y = element_blank(),
                 legend.background = element_rect(fill="#F0F0F0"), # Work legend
                 legend.title = element_text(size=14, colour="#535353"),
                 legend.text = element_text(size=14, colour="#535353"),
                 plot.title = element_text(hjust = .5, size=20, colour="#535353"))
  p <- p + scale_x_discrete(limits=c(1:max(data$degree))) # Scale x axis according to number of polynomials in the model
  # Add solid black line to bottom of the plot (only needed if panel.border not set)
  p <- p + geom_hline(yintercept=min(data$value) - 1,size=1,colour="#535353")

  return(p)
}

# the figures require ggplot2 library and
# all packages it depends on
library(ggplot2)
library(polynom)
library(RColorBrewer)

# simulates data for a polynomial model and adds noise
#
# @param sample_size: number of simulated observations
# @param poly_vec: a vector containing the coefficients of a polynomial
# @param noise: the amount of noise which will be added to the expected
#               values of the polynomial
simulateData <- function(sample_size, poly_vec, noise){
  f <- as.function(polynomial(poly_vec))

  # generate the x predictor
  x <- runif(sample_size, -5, 5)

  # calculate y values
  y <- f(x)

  y <- y + rnorm(sample_size, sd=noise)
  data.frame(x=x, y=y)
}

# fitting lm() polynomials of increasing complexity
# (up to max.degree) and storing their predictions
# in the new.dat data.frame
fitModels <- function(dat, max.poly) {
  res <- lapply(1:max.poly, FUN=function(i) {as.function(polynomial(coef(lm(y ~ poly(x, i, raw=TRUE, simple=TRUE), data = dat))))})

  return(res)
}


plotModels <- function(Data, max.poly){
  estimated_functions <- fitModels(Data, max.poly)
  #colors <- c(brewer.pal(9,"Blues"),"#000000")
  colors <- brewer.pal(10, "RdYlGn")
  names(colors) <- rep(1:10)
  lim_y_min <- -50000; lim_y_max <- 50000
  # plotting the data and the fitted models
  p <- ggplot()
  p <- p + geom_point(aes(x, y), Data, colour = "darkgrey")
  # Set the entire chart region to a light gray color
  p <- p + theme_bw()
  p <- p + theme(panel.background=element_rect(fill="#F0F0F0"),
                 plot.background=element_rect(fill="#F0F0F0"),
                 panel.border=element_rect(colour="#F0F0F0"))
  # Format the grid
  p <- p + theme(panel.grid.major=element_line(colour="#D0D0D0",size=1), axis.ticks=element_blank())
  # Drop axis text, color axis titles
  p <- p + theme(axis.text.x = element_blank(), 
                 axis.text.y = element_blank(), 
                 axis.title = element_text(size = 11,colour = "#535353"))
  # Set y limits to keep the grid fixed
  p <- p + ylim(lim_y_min,lim_y_max)
  # Add solid black line to bottom of the plot
  p <- p + geom_hline(yintercept=lim_y_min,size=.5,colour="#535353")
  # Work legend
  p <- p + theme(legend.background = element_rect(fill="#F0F0F0"), 
                 legend.title = element_text(size=8, colour = "#535353"),
                 legend.text = element_text(size = 9, colour = "#535353"),
                 legend.position = "center")
  degree <- 0
  for (f in estimated_functions) {
    degree <- degree + 1
    p <- p + stat_function(data = data.frame(x = -5:5,
                                             degree = rep(as.character(degree), 11)),
                           fun = f,
                           aes(colour = degree))
  }
  p <- p + scale_color_manual(values=colors,
                              name="Degree",
                              position="right")
  p
}

plotGenerativeModel <- function(poly_vec, noise=NA, min=-5, max=5){
  f <- as.function(polynomial(poly_vec))
  pol <- as.character(polynomial(poly_vec))
  x <- data.frame(x=seq(min, max, 0.01))
  y <- f(x)
  upper <- as.vector(y) + noise
  lower <- as.vector(y) - noise
  d <- data.frame(x=x, y=y, upper=upper, lower=lower)
  p <- ggplot(data=x, aes(x=x))
  p <- p + stat_function(fun=f) #geom_line(aes(x, y), d)
  p <- p + geom_ribbon(aes(x=x, ymax=upper, ymin=lower), d, alpha="0.5")
  p <- p + annotate('text', x = 0, y = max(upper), label=pol, parse=TRUE)#ggtitle(tits)
  p
}

add_bins <- function(data, n_bins, seed) {
  n <- nrow(data)
  bins <- 1:n_bins
  k <- ceiling(n / n_bins)

  set.seed(seed)
  data$bin <- sample(rep(bins, k)[1:n])

  return(data)
}


split_data <- function(data, bin) {
  list(train = data[data$bin != bin, ],
       test = data[data$bin == bin, ])
}


#' Function to cross validate a dataset
#'
#' @param data data.frame of x and y values, x values have to be divisible by .01
#' @param n_bins number of bins you want to split the data for cross validation
#' @param max.poly maximum degree of polynomial you want to validate
#' @param seed seed to control randomness
#'
#' @returns data.frame of all points of data with estimated y values
#' for each polynomial degree and the calculated MSEs
validate_cross <- function(data, n_bins, max.poly = 2, seed = 1337) {
  data <- add_bins(data, n_bins, seed)
  n <- length(n_bins) * max.poly * nrow(data)
  result <- data.frame()

  for (bin in 1:n_bins) {
    split <- split_data(data, bin)

    estimated_functions <- fitModels(split$train, max.poly)

    test_df <- split$test
    test_df <- test_df[order(test_df$x),]
    rownames(test_df) <- 1:nrow(test_df)

    degree <- 0
    for (f in estimated_functions) {
      degree <- degree + 1
      tmp <- test_df
      tmp$estimate <- f(tmp$x)
      tmp$degree <- degree

      result <- rbind(result, tmp)
    }
  }

  result$sqerr <- (result$y - result$estimate)^2

  return(result[order(result$degree),])
}

validation_se <- function(data, n_bins, max.poly = 2, seed = 1337) {
  result <- validate_cross(data, n_bins, max.poly=max.poly, seed=seed)

  # calculate mean for each bin
  result <- aggregate(sqerr ~ degree + bin, result, mean)
  colnames(result)[colnames(result) == "sqerr"] <- "mse"
  result$n <- 1

  # TODO improve this code!
  res <- aggregate(mse ~ degree, result, mean)
  res$sd <- aggregate(mse ~ degree, result, sd)$mse
  res$n <- aggregate(n ~ degree, result, length)$n

  res$se <- res$sd / sqrt(res$n)

  return(res)
}

ModelTable <- function(Polynom){
  
    m <- matrix(0, nrow=Polynom+1, ncol=1)
    rownames(m) <- c("Intercept", paste("X", 1:Polynom, sep="^"))
  #print(m)
      m
}

###### Plots ######

plotCrossValidation <- function(seData){
  
  p <- ggplot(data=seData, aes(x=degree, y=mse))
  p <- p + geom_line(size=1.2, colour="#999999")
  p <- p + geom_errorbar(aes(x=degree, ymin=mse-se, ymax=mse+se), data = seData, width=0.25)
  p <- p + xlab("Degree of polynomial") + ylab("Mean Squared error")
  p <- p + ggtitle(label="Results of crossvalidation")
  # Set the entire chart region to a light gray color
  p <- p + theme_bw()
  p <- p + theme(panel.background=element_rect(fill="#F0F0F0"),
                 plot.background=element_rect(fill="#F0F0F0"),
                 panel.border=element_rect(colour="#F0F0F0"),
                 #panel.border = element_rect(size=1.1, colour="#535353", fill=NA),
                 panel.grid.major=element_line(colour="#D0D0D0",size=.5), # Format grid
                 axis.ticks=element_blank(),
                 axis.text.x = element_text(), # Drop y axis text, color axis titles, add plot border
                 axis.text.y = element_blank(),
                 axis.title = element_text(size=15, colour="#535353"),
                 legend.position = "none", # Drop legend
                 plot.title = element_text(hjust = .5, size=20, colour="#535353")) # Work title
  p <- p + scale_x_discrete(limits=c(1:max(seData$degree)))
  # Add solid black line to bottom of the plot (only needed if panel.border not set)
  p <- p + geom_hline(yintercept=min(seData$mse) - 1, size=1, colour="#535353")
  
  return(p)
}

library(ggplot2)
library(shiny)
library(shinyBS)
library(shinyjs)
library(plotly)
library(polynom)
library(gridExtra)
source("helpers.R")
source("varianceOfFunction.R")
source("plot_crossValidation.R")
source("aicbic.R")

shinyServer(function(input, output) {
  Data <- reactive({
    if(input$Simulate == 0){
      NULL
    }
    isolate(
      simulateData(input$Sample,  Model(), Noise())
    )
  })
  
  # Calculates the sd of the noise based on the variance of the selected function
  # and the proportion of noise in the total variance 
  Noise <- reactive({
    varFunction <- varf(polynomial(Model()), -5, 5)
    varNoise <- varFunction * input$Noise / (1-input$Noise)
    sdNoise <- sqrt(varNoise)
    
    ifelse(sdNoise == 0, input$Noise, sdNoise)
  })
  
  output$ModelPlot <- renderPlotly({
    pdf(NULL)
    plotModels(Data(), input$max.poly)
  })
  # remove the previous plots
  observeEvent(input$Simulate,{
    graphics.off()
    #print(Model())
    })
  
  # update the plots upon pressing button Crossvalidate
  FPlot <- eventReactive(input$Crossvalidate, {
      grid.arrange(
        plot_aic_bic(calc_aic_bic(max.poly = input$max.poly, data = Data())),
        plotCrossValidation(validation_se(Data(),
                                          input$n.bins,
                                          input$max.poly)),
        ncol=2
      )
  })
  # Render the BIC / crossvalidation plot
  output$FitPlot <- renderPlot({FPlot()})

  #observeEvent(input$Simulate, {print(Mod)})
  # plot the generative model
  output$GenerativeModel <- renderPlot({
    plotGenerativeModel(poly_vec = Model(),
                        noise=Noise())

  })

  # 
  # ModelTable <- reactive({
  #   isolate(
  #     m <- matrix(0, nrow=input$Polynom+1, ncol=1),
  #     rownames(m) <- c("Intercept", paste("X", 1:input$Polynom, sep="^")),
  #     m
  #   )
  #   
  # })
  
  Mod <- matrix(c(5,2,1,2,4,5), nrow=6) #reactiveValues()
  #Mod$table <- matrix(c(5,2,1,2), nrow=4)
  observeEvent(input$Polynom, {
      Mod <<- matrix(0, nrow=input$Polynom+1, ncol=1)#ModelTable(input$Polynom)
  })
  # Mod$table <- eventReactive(input$Polynom,{
  #   print(input$Polynom)
  #   ModelTable(input$Polynom)
  # })
  
  Model <- reactive({
    #Terrible, terrible workaround
      selCoef <- which(as.list(
        c("Intercept",paste("X", 1:input$Polynom, sep="^")))==input$SelectCoef)
      Mod[selCoef,] <<- input$Coefficient
      Mod
  })

  output$CoefSelect <- renderUI({
    selectInput("SelectCoef", "Select the coefficient", input$Polynom,
                choices = as.list(c("Intercept", paste("X", 1:input$Polynom, sep="^"))))
  })
})

library(ggplot2)
library(shiny)
library(shinyBS)
library(shinyjs)
library(plotly)
source("helpers.R")

shinyUI(
  # Navbar
  navbarPage("AIC/BIC vs Cross-Validation",
             # Tab1 title
             tabPanel("Model Settings",

                        bsModal(id = "DefineModel", title = "Define your generative model",
                                size = "large", trigger = "ModelInit",
                                sidebarPanel(
                                  numericInput("Polynom",
                                               "Select the highest polynom of the generative function",
                                               min = 1,
                                               max = 10,
                                               value = 3),
                                  uiOutput("CoefSelect"),
                                  # textInput("Coefficients", "Input the coefficients for the polynomials (and intercept)",
                                  #          value="5,2,1,2"),
                                  numericInput("Coefficient", "Select the coefficient", value=0),
                                  sliderInput("Noise", "Define the amount of noise", min = 0, max = 1, step=0.01, value=0.3),
                                  #actionButton(inputId = "initiatePlotGenerative", label = "Plot the model"),
                                  actionButton(inputId = "saveGenerative", label="Save the model")
                                ),
                                mainPanel(plotOutput("GenerativeModel"))
                        ),
                        # Sidebar with a slider input for number of bins
                        sidebarLayout(
                          sidebarPanel(
                            includeScript("../../../Matomo-tquant.js"),
                            
                            actionButton(inputId = "ModelInit", label = "Define your own model"),
                            h3("Simulate data"),
                            numericInput("Sample",
                                         "Sample Size:",
                                         min = 20,
                                         max = 1000,
                                         value = 100),
                            actionButton(inputId="Simulate", "Simulate new data"),
                            h3("Specify models to fit"),
                            sliderInput("max.poly",
                                        "Select the maximum degree of polynomial:",
                                        min = 1,
                                        max = 10,
                                        value = 7)
                          ),

                          # Show a plot of the generated distribution
                          mainPanel(
                            plotlyOutput("ModelPlot")
                          )
                        )
             ),
             # Tab2 title
             tabPanel("Results",
                      #Tab2 sidebar
                      sidebarLayout(
                         sidebarPanel(
                           numericInput("n.bins", "Select the number of folds in a cross-validation",
                                        min=2,
                                        max=10,
                                        value=5),
                           actionButton("Crossvalidate", "Crossvalidate!")
                         ),
                         mainPanel(
                         )
                      ),
                      plotOutput("FitPlot")
             ),
             # Tab3 title
             tabPanel("About",
               fluidRow(
                 column(width = 6, offset = 3,
                   withMathJax(includeMarkdown("description.Rmd"))
                 )
               )
             )

  )
)

varf <-function(f, low, upp){
  suma <- polynom::integral(f, c(low,upp))
  meana <- suma / (upp-low)

  r.sq <- integrate(X.sq, low, upp, f)
  
  abs(r.sq$value - meana^2)
  
}


X.sq <- function(x, fun){
  as.function(fun)(x^2)
}

Simulate data

Specify models to fit

About this App

AIC and BIC

Formulas

Cross Validation