download_data <- function(data = "Graz_prepared.rds") { file <- paste0("https://nikum.org/dmr/Data/", data) tdir <- tempfile() dir.create(tdir) download.file(file, file.path(tdir, data)) return(readRDS(file.path(tdir, data))) } d <- download_data("Graz_prepared.rds") ds <- read.csv2("https://nikum.org/dmr/Data/ECMWF_parameter_description.txt", skip = 8) library(zoo) #some fun preprocessing d_training <- list() d_test <- list() create_tst_tr <- function(dat){ tmp <- zoo(dat) dates <- rownames(tmp) tmp$month <- as.POSIXlt(dates)$mon tmp$year <- as.POSIXlt(dates)$year + 1900 return(list(training=as.data.frame(subset(tmp, time(tmp) < "2023-09-01")), test=as.data.frame(subset(tmp, (time(tmp) >= "2023-09-01") & (time(tmp) <= "2023-12-31"))))) } preprocessed_data <- lapply(d, create_tst_tr) # try some intercept only models to choose appropriate candidates for the distribution library(gamlss2) library(gamlss) families <- bamlss::gamlss_distributions(type="continuous") names(families) ll <- rep(NA, length(families)) names(ll) <- names(families) for(i in seq_along(families)){ b <- try(gamlss2(obs_temp~1, data=preprocessed_data$step_12$training, family = families[[i]])) if(!inherits(b, "try-error")){ ll[i] <- logLik(b) } } sort(ll) barplot(tail(sort(ll))) # all of these are almost as good as the others, there is also no real advantage in using BCPEo (the best one) # no other parameters were fitted beyond the mean when trying to use stabsel, so there doesn't seem to be a point # so we revert back to NO here interceptmod <- gamlss2(obs_temp ~ 1, family=NO, data=preprocessed_data$step_12$training) logLik(interceptmod) coef(interceptmod) summary(interceptmod) plot(interceptmod) potential_predictors <- colnames(preprocessed_data$step_12$training) drop_elements <- list("obs_temp", "obs_wind", "obs_rain", "month", "year") potential_predictors <- potential_predictors[-which(potential_predictors %in% drop_elements)] f <- '~ s(month)' for (predictor in potential_predictors){ f <- paste0(f, " + s(", predictor, ")") } f <- rep(f, 2) f[[1]] <- paste0("obs_temp ", f[[1]]) library(bamlss) pick_pred_run <- stabsel(f, data=preprocessed_data$step_12$training, family=NO, q=10, thr=0.5, B=50) plot(pick_pred_run) result_f <- formula(pick_pred_run) print(result_f) # saving the model here manually just in case reruns of this won't give the same predictors result_f <- obs_temp ~ s(fdir) + s(lightningd6) + s(q850) + s(slhf) + s(sund) + s(t2m) + s(u10m) # only mu was fitted, so we don't need sigma in the following # we used step12 to choose model parameters and then fit the same model for each training set # write function that does the fitting and returns the loglikelihoods fit_compute_loglik <- function(dat){ mod <- gamlss2(result_f, family=NO, data=dat$training) p <- predict(mod, newdata=dat$test, type="parameter") loglik_res <- family(mod)$loglik(dat$test$obs_temp, p) return(loglik_res) } result_loglik <- lapply(preprocessed_data, fit_compute_loglik) sum(unlist(result_loglik)) ## -913.5339.