About this AppWith this app we want to provide a short overview over different model selection techniques. You can try out how these techniques vary for different data sets and underlying structures. On the other tabs of the app, you have a lot of opportunities to try out all this stuff. AIC and BICThe Akaike information criterion (AIC) and the Bayesian information criterion (BIC) are criteria for model selection. Both calculate an estimate of the model quality relative to other models. The model with the lowest AIC/BIC gets selected out of a set of models. Formulas\(\text{AIC} = 2k - 2\ln(\hat L)\) \(\text{BIC} = \ln(n)k - 2\ln(\hat L)\) Cross ValidationCross Validation is a model validation technique that tries to calculate how good the model can predict data by predicting parts of the dataset by another part.
Therefore, the cross-validation method consists of two steps: At first you split your data into a training set and a testing set. After that you train your model on the training set and predict the testing set with the estimated model. |
# calculate AIC + BIC
calc_aic_bic <- function(max.poly, data) {
# df to store AIC + BIC values
df <- data.frame(measure=c(rep("AIC",max.poly),rep("BIC",max.poly)),
value=numeric(max.poly * 2),
degree=rep(1:max.poly, 2))
# AIC + BIC over the max.poly models
for(i in 1:max.poly){
df[i,2] <- AIC(lm(y~poly(x,i), data))
df[i+max.poly,2] <- BIC(lm(y~poly(x,i), data))
}
return(df)
}
# plot AIC + BIC
plot_aic_bic <- function(data) {
p <- ggplot(data=data, aes(x=degree, y=value, group=measure, colour=measure))
p <- p + geom_line(size=.8)
p <- p + geom_point(size=3)
p <- p + ggtitle(label="AIC and BIC")
p <- p + xlab("Number of polynomials in the model") + labs(colour="Criterion")
p <- p + theme_bw()
p <- p + theme(panel.background=element_rect(fill="#F0F0F0"), # Set the entire chart region to a light gray color
plot.background=element_rect(fill="#F0F0F0"),
panel.grid.major=element_line(colour="#D0D0D0",size=.5), # Format the grid
panel.border=element_rect(colour="#F0F0F0"),
#panel.border = element_rect(size=1.1, colour="#535353", fill=NA),
axis.ticks=element_blank(),
axis.text.x = element_text(),
axis.text.y = element_blank(), # Drop y axis text
axis.title.x = element_text(size = 15,colour = "#535353"), # Color axis titles
axis.title.y = element_blank(),
legend.background = element_rect(fill="#F0F0F0"), # Work legend
legend.title = element_text(size=14, colour="#535353"),
legend.text = element_text(size=14, colour="#535353"),
plot.title = element_text(hjust = .5, size=20, colour="#535353"))
p <- p + scale_x_discrete(limits=c(1:max(data$degree))) # Scale x axis according to number of polynomials in the model
# Add solid black line to bottom of the plot (only needed if panel.border not set)
p <- p + geom_hline(yintercept=min(data$value) - 1,size=1,colour="#535353")
return(p)
}
# the figures require ggplot2 library and
# all packages it depends on
library(ggplot2)
library(polynom)
library(RColorBrewer)
# simulates data for a polynomial model and adds noise
#
# @param sample_size: number of simulated observations
# @param poly_vec: a vector containing the coefficients of a polynomial
# @param noise: the amount of noise which will be added to the expected
# values of the polynomial
simulateData <- function(sample_size, poly_vec, noise){
f <- as.function(polynomial(poly_vec))
# generate the x predictor
x <- runif(sample_size, -5, 5)
# calculate y values
y <- f(x)
y <- y + rnorm(sample_size, sd=noise)
data.frame(x=x, y=y)
}
# fitting lm() polynomials of increasing complexity
# (up to max.degree) and storing their predictions
# in the new.dat data.frame
fitModels <- function(dat, max.poly) {
res <- lapply(1:max.poly, FUN=function(i) {as.function(polynomial(coef(lm(y ~ poly(x, i, raw=TRUE, simple=TRUE), data = dat))))})
return(res)
}
plotModels <- function(Data, max.poly){
estimated_functions <- fitModels(Data, max.poly)
#colors <- c(brewer.pal(9,"Blues"),"#000000")
colors <- brewer.pal(10, "RdYlGn")
names(colors) <- rep(1:10)
lim_y_min <- -50000; lim_y_max <- 50000
# plotting the data and the fitted models
p <- ggplot()
p <- p + geom_point(aes(x, y), Data, colour = "darkgrey")
# Set the entire chart region to a light gray color
p <- p + theme_bw()
p <- p + theme(panel.background=element_rect(fill="#F0F0F0"),
plot.background=element_rect(fill="#F0F0F0"),
panel.border=element_rect(colour="#F0F0F0"))
# Format the grid
p <- p + theme(panel.grid.major=element_line(colour="#D0D0D0",size=1), axis.ticks=element_blank())
# Drop axis text, color axis titles
p <- p + theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.title = element_text(size = 11,colour = "#535353"))
# Set y limits to keep the grid fixed
p <- p + ylim(lim_y_min,lim_y_max)
# Add solid black line to bottom of the plot
p <- p + geom_hline(yintercept=lim_y_min,size=.5,colour="#535353")
# Work legend
p <- p + theme(legend.background = element_rect(fill="#F0F0F0"),
legend.title = element_text(size=8, colour = "#535353"),
legend.text = element_text(size = 9, colour = "#535353"),
legend.position = "center")
degree <- 0
for (f in estimated_functions) {
degree <- degree + 1
p <- p + stat_function(data = data.frame(x = -5:5,
degree = rep(as.character(degree), 11)),
fun = f,
aes(colour = degree))
}
p <- p + scale_color_manual(values=colors,
name="Degree",
position="right")
p
}
plotGenerativeModel <- function(poly_vec, noise=NA, min=-5, max=5){
f <- as.function(polynomial(poly_vec))
pol <- as.character(polynomial(poly_vec))
x <- data.frame(x=seq(min, max, 0.01))
y <- f(x)
upper <- as.vector(y) + noise
lower <- as.vector(y) - noise
d <- data.frame(x=x, y=y, upper=upper, lower=lower)
p <- ggplot(data=x, aes(x=x))
p <- p + stat_function(fun=f) #geom_line(aes(x, y), d)
p <- p + geom_ribbon(aes(x=x, ymax=upper, ymin=lower), d, alpha="0.5")
p <- p + annotate('text', x = 0, y = max(upper), label=pol, parse=TRUE)#ggtitle(tits)
p
}
add_bins <- function(data, n_bins, seed) {
n <- nrow(data)
bins <- 1:n_bins
k <- ceiling(n / n_bins)
set.seed(seed)
data$bin <- sample(rep(bins, k)[1:n])
return(data)
}
split_data <- function(data, bin) {
list(train = data[data$bin != bin, ],
test = data[data$bin == bin, ])
}
#' Function to cross validate a dataset
#'
#' @param data data.frame of x and y values, x values have to be divisible by .01
#' @param n_bins number of bins you want to split the data for cross validation
#' @param max.poly maximum degree of polynomial you want to validate
#' @param seed seed to control randomness
#'
#' @returns data.frame of all points of data with estimated y values
#' for each polynomial degree and the calculated MSEs
validate_cross <- function(data, n_bins, max.poly = 2, seed = 1337) {
data <- add_bins(data, n_bins, seed)
n <- length(n_bins) * max.poly * nrow(data)
result <- data.frame()
for (bin in 1:n_bins) {
split <- split_data(data, bin)
estimated_functions <- fitModels(split$train, max.poly)
test_df <- split$test
test_df <- test_df[order(test_df$x),]
rownames(test_df) <- 1:nrow(test_df)
degree <- 0
for (f in estimated_functions) {
degree <- degree + 1
tmp <- test_df
tmp$estimate <- f(tmp$x)
tmp$degree <- degree
result <- rbind(result, tmp)
}
}
result$sqerr <- (result$y - result$estimate)^2
return(result[order(result$degree),])
}
validation_se <- function(data, n_bins, max.poly = 2, seed = 1337) {
result <- validate_cross(data, n_bins, max.poly=max.poly, seed=seed)
# calculate mean for each bin
result <- aggregate(sqerr ~ degree + bin, result, mean)
colnames(result)[colnames(result) == "sqerr"] <- "mse"
result$n <- 1
# TODO improve this code!
res <- aggregate(mse ~ degree, result, mean)
res$sd <- aggregate(mse ~ degree, result, sd)$mse
res$n <- aggregate(n ~ degree, result, length)$n
res$se <- res$sd / sqrt(res$n)
return(res)
}
ModelTable <- function(Polynom){
m <- matrix(0, nrow=Polynom+1, ncol=1)
rownames(m) <- c("Intercept", paste("X", 1:Polynom, sep="^"))
#print(m)
m
}
###### Plots ######
plotCrossValidation <- function(seData){
p <- ggplot(data=seData, aes(x=degree, y=mse))
p <- p + geom_line(size=1.2, colour="#999999")
p <- p + geom_errorbar(aes(x=degree, ymin=mse-se, ymax=mse+se), data = seData, width=0.25)
p <- p + xlab("Degree of polynomial") + ylab("Mean Squared error")
p <- p + ggtitle(label="Results of crossvalidation")
# Set the entire chart region to a light gray color
p <- p + theme_bw()
p <- p + theme(panel.background=element_rect(fill="#F0F0F0"),
plot.background=element_rect(fill="#F0F0F0"),
panel.border=element_rect(colour="#F0F0F0"),
#panel.border = element_rect(size=1.1, colour="#535353", fill=NA),
panel.grid.major=element_line(colour="#D0D0D0",size=.5), # Format grid
axis.ticks=element_blank(),
axis.text.x = element_text(), # Drop y axis text, color axis titles, add plot border
axis.text.y = element_blank(),
axis.title = element_text(size=15, colour="#535353"),
legend.position = "none", # Drop legend
plot.title = element_text(hjust = .5, size=20, colour="#535353")) # Work title
p <- p + scale_x_discrete(limits=c(1:max(seData$degree)))
# Add solid black line to bottom of the plot (only needed if panel.border not set)
p <- p + geom_hline(yintercept=min(seData$mse) - 1, size=1, colour="#535353")
return(p)
}
library(ggplot2)
library(shiny)
library(shinyBS)
library(shinyjs)
library(plotly)
library(polynom)
library(gridExtra)
source("helpers.R")
source("varianceOfFunction.R")
source("plot_crossValidation.R")
source("aicbic.R")
shinyServer(function(input, output) {
Data <- reactive({
if(input$Simulate == 0){
NULL
}
isolate(
simulateData(input$Sample, Model(), Noise())
)
})
# Calculates the sd of the noise based on the variance of the selected function
# and the proportion of noise in the total variance
Noise <- reactive({
varFunction <- varf(polynomial(Model()), -5, 5)
varNoise <- varFunction * input$Noise / (1-input$Noise)
sdNoise <- sqrt(varNoise)
ifelse(sdNoise == 0, input$Noise, sdNoise)
})
output$ModelPlot <- renderPlotly({
pdf(NULL)
plotModels(Data(), input$max.poly)
})
# remove the previous plots
observeEvent(input$Simulate,{
graphics.off()
#print(Model())
})
# update the plots upon pressing button Crossvalidate
FPlot <- eventReactive(input$Crossvalidate, {
grid.arrange(
plot_aic_bic(calc_aic_bic(max.poly = input$max.poly, data = Data())),
plotCrossValidation(validation_se(Data(),
input$n.bins,
input$max.poly)),
ncol=2
)
})
# Render the BIC / crossvalidation plot
output$FitPlot <- renderPlot({FPlot()})
#observeEvent(input$Simulate, {print(Mod)})
# plot the generative model
output$GenerativeModel <- renderPlot({
plotGenerativeModel(poly_vec = Model(),
noise=Noise())
})
#
# ModelTable <- reactive({
# isolate(
# m <- matrix(0, nrow=input$Polynom+1, ncol=1),
# rownames(m) <- c("Intercept", paste("X", 1:input$Polynom, sep="^")),
# m
# )
#
# })
Mod <- matrix(c(5,2,1,2,4,5), nrow=6) #reactiveValues()
#Mod$table <- matrix(c(5,2,1,2), nrow=4)
observeEvent(input$Polynom, {
Mod <<- matrix(0, nrow=input$Polynom+1, ncol=1)#ModelTable(input$Polynom)
})
# Mod$table <- eventReactive(input$Polynom,{
# print(input$Polynom)
# ModelTable(input$Polynom)
# })
Model <- reactive({
#Terrible, terrible workaround
selCoef <- which(as.list(
c("Intercept",paste("X", 1:input$Polynom, sep="^")))==input$SelectCoef)
Mod[selCoef,] <<- input$Coefficient
Mod
})
output$CoefSelect <- renderUI({
selectInput("SelectCoef", "Select the coefficient", input$Polynom,
choices = as.list(c("Intercept", paste("X", 1:input$Polynom, sep="^"))))
})
})
library(ggplot2)
library(shiny)
library(shinyBS)
library(shinyjs)
library(plotly)
source("helpers.R")
shinyUI(
# Navbar
navbarPage("AIC/BIC vs Cross-Validation",
# Tab1 title
tabPanel("Model Settings",
bsModal(id = "DefineModel", title = "Define your generative model",
size = "large", trigger = "ModelInit",
sidebarPanel(
numericInput("Polynom",
"Select the highest polynom of the generative function",
min = 1,
max = 10,
value = 3),
uiOutput("CoefSelect"),
# textInput("Coefficients", "Input the coefficients for the polynomials (and intercept)",
# value="5,2,1,2"),
numericInput("Coefficient", "Select the coefficient", value=0),
sliderInput("Noise", "Define the amount of noise", min = 0, max = 1, step=0.01, value=0.3),
#actionButton(inputId = "initiatePlotGenerative", label = "Plot the model"),
actionButton(inputId = "saveGenerative", label="Save the model")
),
mainPanel(plotOutput("GenerativeModel"))
),
# Sidebar with a slider input for number of bins
sidebarLayout(
sidebarPanel(
includeScript("../../../Matomo-tquant.js"),
actionButton(inputId = "ModelInit", label = "Define your own model"),
h3("Simulate data"),
numericInput("Sample",
"Sample Size:",
min = 20,
max = 1000,
value = 100),
actionButton(inputId="Simulate", "Simulate new data"),
h3("Specify models to fit"),
sliderInput("max.poly",
"Select the maximum degree of polynomial:",
min = 1,
max = 10,
value = 7)
),
# Show a plot of the generated distribution
mainPanel(
plotlyOutput("ModelPlot")
)
)
),
# Tab2 title
tabPanel("Results",
#Tab2 sidebar
sidebarLayout(
sidebarPanel(
numericInput("n.bins", "Select the number of folds in a cross-validation",
min=2,
max=10,
value=5),
actionButton("Crossvalidate", "Crossvalidate!")
),
mainPanel(
)
),
plotOutput("FitPlot")
),
# Tab3 title
tabPanel("About",
fluidRow(
column(width = 6, offset = 3,
withMathJax(includeMarkdown("description.Rmd"))
)
)
)
)
)
varf <-function(f, low, upp){
suma <- polynom::integral(f, c(low,upp))
meana <- suma / (upp-low)
r.sq <- integrate(X.sq, low, upp, f)
abs(r.sq$value - meana^2)
}
X.sq <- function(x, fun){
as.function(fun)(x^2)
}