download_data <- function(data = "Graz_prepared.rds") {
  file <- paste0("https://nikum.org/dmr/Data/", data)
  tdir <- tempfile()
  dir.create(tdir)
  download.file(file, file.path(tdir, data))
  return(readRDS(file.path(tdir, data)))
  }

d <- download_data("Graz_prepared.rds")

ds <- read.csv2("https://nikum.org/dmr/Data/ECMWF_parameter_description.txt", skip = 8)

library(zoo)

#some fun preprocessing

d_training <- list()
d_test <- list()
  
create_tst_tr <- function(dat){
  tmp <- zoo(dat)
  dates <- rownames(tmp)
  tmp$month <- as.POSIXlt(dates)$mon
  tmp$year <- as.POSIXlt(dates)$year + 1900
  return(list(training=as.data.frame(subset(tmp, time(tmp) < "2023-09-01")), 
              test=as.data.frame(subset(tmp, (time(tmp) >= "2023-09-01") & (time(tmp) <= "2023-12-31")))))
}

preprocessed_data <- lapply(d, create_tst_tr)

# try some intercept only models to choose appropriate candidates for the distribution

library(gamlss2)
library(gamlss)

families <- bamlss::gamlss_distributions(type="continuous")
names(families)

ll <- rep(NA, length(families))
names(ll) <- names(families)

for(i in seq_along(families)){
  b <- try(gamlss2(obs_temp~1, data=preprocessed_data$step_12$training, family = families[[i]]))
  if(!inherits(b, "try-error")){
    ll[i] <- logLik(b)
  }
}

sort(ll)
barplot(tail(sort(ll)))

# all of these are almost as good as the others, there is also no real advantage in using BCPEo (the best one)
# no other parameters were fitted beyond the mean when trying to use stabsel, so there doesn't seem to be a point
# so we revert back to NO here

interceptmod <- gamlss2(obs_temp ~ 1, family=NO, data=preprocessed_data$step_12$training)

logLik(interceptmod)
coef(interceptmod)
summary(interceptmod)
plot(interceptmod)

potential_predictors <- colnames(preprocessed_data$step_12$training)
drop_elements <- list("obs_temp", "obs_wind", "obs_rain", "month", "year")
potential_predictors <- potential_predictors[-which(potential_predictors %in% drop_elements)]

f <- '~ s(month)'
for (predictor in potential_predictors){
  f <- paste0(f, " + s(", predictor, ")")
}

f <- rep(f, 2)
f[[1]] <- paste0("obs_temp ", f[[1]])

library(bamlss)

pick_pred_run <- stabsel(f, data=preprocessed_data$step_12$training, family=NO, q=10, thr=0.5, B=50)
plot(pick_pred_run)

result_f <- formula(pick_pred_run) 
print(result_f) 

# saving the model here manually just in case reruns of this won't give the same predictors

result_f <- obs_temp ~ s(fdir) + s(lightningd6) + s(q850) + s(slhf) + s(sund) + s(t2m) + s(u10m)

# only mu was fitted, so we don't need sigma in the following

# we used step12 to choose model parameters and then fit the same model for each training set
# write function that does the fitting and returns the loglikelihoods

fit_compute_loglik <- function(dat){
  mod <- gamlss2(result_f, family=NO, data=dat$training)
  p <- predict(mod, newdata=dat$test, type="parameter")
  loglik_res <- family(mod)$loglik(dat$test$obs_temp, p)
  
  return(loglik_res)
}

result_loglik <- lapply(preprocessed_data, fit_compute_loglik)
sum(unlist(result_loglik))

## -913.5339.