From b63378c53850a7a7b7cced56957c2ce5ec2c4465 Mon Sep 17 00:00:00 2001 From: Florian Oswald Date: Fri, 13 Nov 2020 23:23:21 +0100 Subject: [PATCH 1/4] added bias variance tradeoff app --- .../bias_variance/app.R | 124 ++++++++++++++++++ inst/tutorial/chapter2/chapter2.Rmd | 3 +- 2 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 inst/shinys/bias_variance_tradeoff/bias_variance/app.R diff --git a/inst/shinys/bias_variance_tradeoff/bias_variance/app.R b/inst/shinys/bias_variance_tradeoff/bias_variance/app.R new file mode 100644 index 0000000..99edd2b --- /dev/null +++ b/inst/shinys/bias_variance_tradeoff/bias_variance/app.R @@ -0,0 +1,124 @@ +# +# This is a Shiny web application. You can run the application by clicking +# the 'Run App' button above. +# +# Find out more about building applications with Shiny here: +# +# http://shiny.rstudio.com/ +# + +library(shiny) +library(ggplot2) +library(dplyr) +library(gghighlight) +library(cowplot) +library(tidyr) + +getmodels <- function(x,y,newx,dfs = 2:20){ + r = data.frame(x=x,y=y) + o = data.frame(x=newx) + s = list() + # browser() + for (i in 1:length(dfs)){ + if (dfs[i] == 2){ + s[[i]] <- lm(y~x,r) + o = cbind(o, predict(s[[i]], newdata = o)) + } else { + s[[i]] <- smooth.spline(x,y,df = dfs[i]) + o = cbind(o, predict(s[[i]], o$x)$y) + } + } + names(o)[-c(1)] <- paste0("df",dfs) + names(s) <- paste0("df",dfs) + list(models = s, pred = o, original = r) +} + +data_alldf <- function(fun = function(x) {x*sin(x-2) + 0.2*x},n=90,ub = 5,nnew = 200,maxdf=60){ + set.seed(1234) + eps = 1 # hard code variance + + r = data.frame(x = seq(0,ub,length.out = n)) + r$truth = fun(r$x) + r$epsi = rnorm(n,mean = 0, sd = eps) + r$y = r$truth + r$epsi + + mods = getmodels(r$x,r$y,seq(0,ub, length.out = nnew),dfs = seq(2,maxdf,by = 2)) + # add test data to predictions + mods$pred$truth = fun(mods$pred$x) + mods$pred$testdata = mods$pred$truth + rnorm(nnew,mean = 0, sd = eps) + mods$plotdata = mods$pred %>% + select(-testdata) %>% + tidyr::pivot_longer(-x,names_to = "flexibility") + + # mses and bias + mses = list( + train = colMeans(sapply(mods$models,residuals)^2) + ) # test mses + mses$test <- colMeans((mods$pred[,names(mods$models)] - mods$pred[,"testdata"])^2) + + # bias + mses$bias <- colMeans((mods$pred[,names(mods$models)] - mods$pred[,"truth"])^2) + mses$var <- diag(var(mods$pred[,names(mods$models)])) + + mses$plotdata <- data.frame(mses) + mses$plotdata$flexibility = seq(2,maxdf,by = 2) + mses$plotdata = mses$plotdata %>% tidyr::pivot_longer(-flexibility, names_to = "variable") + + + list(models = mods,mses = mses) + +} + + + +plot_singledf <- function(x,df){ + d = x$models$plotdata + m = x$mses$plotdata %>% filter(variable %in% c("test","train")) + + p = ggplot(d %>% filter(flexibility != "truth")) + p = p + geom_line(aes(x = x, y = value, color = flexibility), size = 1) + gghighlight(flexibility == paste0("df",df)) + p = p + geom_line(data = d %>% filter(flexibility == "truth"), aes(x=x,y = value), color = "black", size = 1) + p = p + geom_point(data = x$models$original, aes(x,y), shape = 1,size = 2) + theme_bw() + + b = ggplot(m, aes(x=flexibility,y = value, color = variable)) + + geom_point() + gghighlight(flexibility == df) + b = b + geom_hline(yintercept = 1, linetype = "dashed", color = "grey")+ theme_bw() + cowplot::plot_grid(p,b) +} + + +# Define UI for application that draws a histogram +ui <- fluidPage( + + # Application title + titlePanel("Bias Variance Tradeoff"), + + # Sidebar with a slider input for number of bins + sidebarLayout( + sidebarPanel( + sliderInput("df", + "Degrees of Freedom:", + min = 2, + max = 60, + step = 2, + value = 2) + ), + + # Show a plot of the generated distribution + mainPanel( + plotOutput("dfPlot") + ) + ) +) + +# Define server logic required to draw a histogram +server <- function(input, output) { + d = data_alldf() + + output$dfPlot <- renderPlot({ + plot_singledf(d,input$df) + }) +} + +# Run the application +shinyApp(ui = ui, server = server) \ No newline at end of file diff --git a/inst/tutorial/chapter2/chapter2.Rmd b/inst/tutorial/chapter2/chapter2.Rmd index e840698..cb83292 100644 --- a/inst/tutorial/chapter2/chapter2.Rmd +++ b/inst/tutorial/chapter2/chapter2.Rmd @@ -10,8 +10,7 @@ runtime: shiny_prerendered ```{r setup, include=FALSE} library(learnr) knitr::opts_chunk$set(echo = FALSE) -library(EnvStats, quietly = TRUE, warn.conflicts = FALSE, verbose = FALSE) # For Pareto Distribution -# library(distr) +library(EnvStats) # For Pareto Distribution x = c(6,2,5,3,5,1,5,7,6,3) ``` From 1bea9623f3c4bc507ca4e314dbf223e1c07dc3c2 Mon Sep 17 00:00:00 2001 From: Florian Oswald Date: Sat, 14 Nov 2020 22:41:51 +0100 Subject: [PATCH 2/4] improved app --- .../bias_variance/app.R | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/inst/shinys/bias_variance_tradeoff/bias_variance/app.R b/inst/shinys/bias_variance_tradeoff/bias_variance/app.R index 99edd2b..9f9ac7a 100644 --- a/inst/shinys/bias_variance_tradeoff/bias_variance/app.R +++ b/inst/shinys/bias_variance_tradeoff/bias_variance/app.R @@ -14,6 +14,12 @@ library(gghighlight) library(cowplot) library(tidyr) +# globals +STEP = 10 +MAXDF = 60 +EPS = 1 +DFS = c(2:8,seq(10,MAXDF,by = STEP)) + getmodels <- function(x,y,newx,dfs = 2:20){ r = data.frame(x=x,y=y) o = data.frame(x=newx) @@ -33,19 +39,19 @@ getmodels <- function(x,y,newx,dfs = 2:20){ list(models = s, pred = o, original = r) } -data_alldf <- function(fun = function(x) {x*sin(x-2) + 0.2*x},n=90,ub = 5,nnew = 200,maxdf=60){ +data_alldf <- function(fun = function(x) {x*sin(x-2) + 0.2*x},n=90,ub = 5,nnew = 200){ set.seed(1234) - eps = 1 # hard code variance + # eps = 1 # hard code variance r = data.frame(x = seq(0,ub,length.out = n)) r$truth = fun(r$x) - r$epsi = rnorm(n,mean = 0, sd = eps) + r$epsi = rnorm(n,mean = 0, sd = EPS) r$y = r$truth + r$epsi - mods = getmodels(r$x,r$y,seq(0,ub, length.out = nnew),dfs = seq(2,maxdf,by = 2)) + mods = getmodels(r$x,r$y,seq(0,ub, length.out = nnew),dfs = DFS) # add test data to predictions mods$pred$truth = fun(mods$pred$x) - mods$pred$testdata = mods$pred$truth + rnorm(nnew,mean = 0, sd = eps) + mods$pred$testdata = mods$pred$truth + rnorm(nnew,mean = 0, sd = EPS) mods$plotdata = mods$pred %>% select(-testdata) %>% tidyr::pivot_longer(-x,names_to = "flexibility") @@ -61,7 +67,7 @@ data_alldf <- function(fun = function(x) {x*sin(x-2) + 0.2*x},n=90,ub = 5,nnew = mses$var <- diag(var(mods$pred[,names(mods$models)])) mses$plotdata <- data.frame(mses) - mses$plotdata$flexibility = seq(2,maxdf,by = 2) + mses$plotdata$flexibility = DFS mses$plotdata = mses$plotdata %>% tidyr::pivot_longer(-flexibility, names_to = "variable") @@ -78,11 +84,12 @@ plot_singledf <- function(x,df){ p = ggplot(d %>% filter(flexibility != "truth")) p = p + geom_line(aes(x = x, y = value, color = flexibility), size = 1) + gghighlight(flexibility == paste0("df",df)) p = p + geom_line(data = d %>% filter(flexibility == "truth"), aes(x=x,y = value), color = "black", size = 1) - p = p + geom_point(data = x$models$original, aes(x,y), shape = 1,size = 2) + theme_bw() + p = p + geom_point(data = x$models$original, aes(x,y), shape = 1,size = 2) + theme_bw() + labs(title = "Fitting Data on f: f(x) + e", caption = "Solid black line is true f. Circles are realizations of f(X) + e: the data.") + ylab("Estimate of f") b = ggplot(m, aes(x=flexibility,y = value, color = variable)) + - geom_point() + gghighlight(flexibility == df) - b = b + geom_hline(yintercept = 1, linetype = "dashed", color = "grey")+ theme_bw() + geom_point(size = 3, shape = 15) + gghighlight(flexibility == df) + b = b + geom_line(data = m, aes(x=flexibility, y = value, group = variable), color = "grey") + ylab("MSE") + xlab("degree of flexibility (degrees of freedom)") + b = b + geom_hline(yintercept = 1, linetype = "dashed", color = "grey")+ theme_bw() + labs(title = "Mean Squared Errors") cowplot::plot_grid(p,b) } @@ -96,12 +103,13 @@ ui <- fluidPage( # Sidebar with a slider input for number of bins sidebarLayout( sidebarPanel( - sliderInput("df", + shinyWidgets::sliderTextInput("df", "Degrees of Freedom:", - min = 2, - max = 60, - step = 2, - value = 2) + choices = DFS) + # min = 2, + # max = MAXDF, + # step = 1, + # value = 2) ), # Show a plot of the generated distribution From 8d40404358adae4bbd82be7dba7b43968ccc960e Mon Sep 17 00:00:00 2001 From: Florian Oswald Date: Sat, 14 Nov 2020 22:42:47 +0100 Subject: [PATCH 3/4] missed a package --- inst/shinys/bias_variance_tradeoff/bias_variance/app.R | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/shinys/bias_variance_tradeoff/bias_variance/app.R b/inst/shinys/bias_variance_tradeoff/bias_variance/app.R index 9f9ac7a..b04ee4e 100644 --- a/inst/shinys/bias_variance_tradeoff/bias_variance/app.R +++ b/inst/shinys/bias_variance_tradeoff/bias_variance/app.R @@ -8,6 +8,7 @@ # library(shiny) +library(shinyWidgets) library(ggplot2) library(dplyr) library(gghighlight) From 525804628b90387655033c8a78966021f0225831 Mon Sep 17 00:00:00 2001 From: Florian Oswald Date: Tue, 24 Nov 2020 17:47:06 +0100 Subject: [PATCH 4/4] moved --- inst/shinys/bias_variance_tradeoff/{bias_variance => }/app.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename inst/shinys/bias_variance_tradeoff/{bias_variance => }/app.R (100%) diff --git a/inst/shinys/bias_variance_tradeoff/bias_variance/app.R b/inst/shinys/bias_variance_tradeoff/app.R similarity index 100% rename from inst/shinys/bias_variance_tradeoff/bias_variance/app.R rename to inst/shinys/bias_variance_tradeoff/app.R