library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ readr 1.3.1
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
library(leaps)
library(forecast)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
weather_train <- read.csv("weather_train.csv")
weather_train1 <- weather_train %>% filter(weather_train$site_id == 4)
weather_train1$timestamp <- as.Date(weather_train1$timestamp)
weather <- weather_train1 %>% group_by(timestamp = weather_train1$timestamp) %>% summarise(avg_temp = avg_temp <- mean(air_temperature,na.rm = TRUE),avg_cloud = avg_cloud <- mean(cloud_coverage,na.rm = TRUE),avg_dewtemp = avg_dewtemp <- mean(dew_temperature,na.rm = TRUE),avg_sea = avg_sea <- mean(sea_level_pressure,na.rm = TRUE),avg_wind = avg_wind <- mean(wind_speed,na.rm = TRUE))
Wrangling and joing data
Drop cloud due to null value
building_metadata <- read.csv("building_metadata.csv")
train <- read.csv("train.csv")
building <- dplyr::inner_join(building_metadata,train,by="building_id")
building <- building %>% filter(building$site_id == 4)
building$timestamp <- as.Date(building$timestamp)
building_group <- building %>% group_by(timestamp=building$timestamp) %>%
summarise(avg_meter_reading = avg_meter_reading <- mean(meter_reading,na.rm=TRUE))
final_df <- dplyr::inner_join(weather,building_group,by="timestamp")
print(final_df)
## # A tibble: 366 x 7
## timestamp avg_temp avg_cloud avg_dewtemp avg_sea avg_wind
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016-01-01 6.06 0.957 -4.98 1021. 2.57
## 2 2016-01-02 7.53 4 -2.31 1019. 3.3
## 3 2016-01-03 9.3 NaN 0.679 1014. 3.14
## 4 2016-01-04 10.7 NaN 6.22 1008. 2.38
## 5 2016-01-05 11.6 NaN 9.45 1005. 5.61
## 6 2016-01-06 10.5 2.4 7.88 1007. 5.84
## 7 2016-01-07 9.68 3 7.10 1010. 3.07
## 8 2016-01-08 8.52 2.91 5.84 1018. 1.37
## 9 2016-01-09 10.7 NaN 7.15 1019. 2.89
## 10 2016-01-10 11.0 NaN 8.02 1022. 2.92
## # … with 356 more rows, and 1 more variable: avg_meter_reading <dbl>
final_df_drop_cloud <- final_df[,-3]
print(final_df_drop_cloud) #cause there are null in the variables
## # A tibble: 366 x 6
## timestamp avg_temp avg_dewtemp avg_sea avg_wind avg_meter_reading
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016-01-01 6.06 -4.98 1021. 2.57 155.
## 2 2016-01-02 7.53 -2.31 1019. 3.3 153.
## 3 2016-01-03 9.3 0.679 1014. 3.14 163.
## 4 2016-01-04 10.7 6.22 1008. 2.38 181.
## 5 2016-01-05 11.6 9.45 1005. 5.61 182.
## 6 2016-01-06 10.5 7.88 1007. 5.84 178.
## 7 2016-01-07 9.68 7.10 1010. 3.07 178.
## 8 2016-01-08 8.52 5.84 1018. 1.37 180.
## 9 2016-01-09 10.7 7.15 1019. 2.89 157.
## 10 2016-01-10 11.0 8.02 1022. 2.92 160.
## # … with 356 more rows
box-and-whisker plots for relevant and important variables.
par(mfrow=c(1,5))
boxplot(final_df_drop_cloud$avg_meter_reading,main="Meter Reading")
boxplot(final_df_drop_cloud$avg_temp,main="Temperature")
boxplot(final_df_drop_cloud$avg_dewtemp,main="Dew Temperature")
boxplot(final_df_drop_cloud$avg_sea,main="Sea Level")
boxplot(final_df_drop_cloud$avg_wind,main="Wind Speed")
The minimum, maximum, and average (mean, median, mode) and standard deviation / variance of important variables.
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Minimum <-c(min(final_df_drop_cloud$avg_meter_reading),min(final_df_drop_cloud$avg_temp),min(final_df_drop_cloud$avg_dewtemp),min(final_df_drop_cloud$avg_sea),min(final_df_drop_cloud$avg_wind))
Maximum <- c(max(final_df_drop_cloud$avg_meter_reading),max(final_df_drop_cloud$avg_temp),max(final_df_drop_cloud$avg_dewtemp),max(final_df_drop_cloud$avg_sea),max(final_df_drop_cloud$avg_wind))
Mean <- c(mean(final_df_drop_cloud$avg_meter_reading),mean(final_df_drop_cloud$avg_temp),mean(final_df_drop_cloud$avg_dewtemp),mean(final_df_drop_cloud$avg_sea),mean(final_df_drop_cloud$avg_wind))
Median <- c(median(final_df_drop_cloud$avg_meter_reading),median(final_df_drop_cloud$avg_temp),median(final_df_drop_cloud$avg_dewtemp),median(final_df_drop_cloud$avg_sea),median(final_df_drop_cloud$avg_wind))
Sd <- c(sd(final_df_drop_cloud$avg_meter_reading),sd(final_df_drop_cloud$avg_temp),sd(final_df_drop_cloud$avg_dewtemp),sd(final_df_drop_cloud$avg_sea),sd(final_df_drop_cloud$avg_wind)) # bring all together
#names(table) <- c("Minimum","Maximum","Average", "Median","Standard Deviation")
all = data.frame(Minimum,Maximum,Mean,Median,Sd)
row.names(all)<-c("Meter Reading","Temperature","Dew Temperature","Sea Level","Wind Speed")
all %>%
knitr::kable(digits=3,align='c')%>%
kable_styling()
Minimum | Maximum | Mean | Median | Sd | |
---|---|---|---|---|---|
Meter Reading | 140.790 | 208.407 | 181.223 | 183.486 | 12.591 |
Temperature | 6.057 | 23.867 | 15.235 | 15.842 | 2.965 |
Dew Temperature | -4.983 | 16.554 | 9.894 | 10.521 | 3.309 |
Sea Level | 1004.883 | 1030.017 | 1016.881 | 1016.650 | 4.316 |
Wind Speed | 1.075 | 10.663 | 3.858 | 3.819 | 1.596 |
pairs(final_df_drop_cloud[,2:6])
Create train and validation datasets
# select variables for regression
#selected.var <- c(3, 4, 7, 8, 9, 10, 12, 13, 14, 17, 18)
# partition data
set.seed(1) # set seed for reproducing the partition
train.index <- sample(c(1:366), 220)
#Create and set aside the remaining 40% of the data, to be used after omitting unhelpful data points and unnecessary variables.
train.t <- final_df_drop_cloud[train.index,]
valid.t <- final_df_drop_cloud[-train.index,]
Try Step wise selection (Both) Elect this method (High R-Square) and Show (residual histogram, QQ plot, Residual vs. Fitted Value)
### both
reg <- lm(avg_meter_reading ~. -timestamp, data = final_df_drop_cloud )
summary(reg)
##
## Call:
## lm(formula = avg_meter_reading ~ . - timestamp, data = final_df_drop_cloud)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.045 -9.984 2.125 7.721 24.314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -505.4848 178.9070 -2.825 0.004985 **
## avg_temp 2.2688 0.3236 7.010 1.18e-11 ***
## avg_dewtemp -0.2163 0.2808 -0.770 0.441573
## avg_sea 0.6454 0.1738 3.714 0.000237 ***
## avg_wind -0.5268 0.4181 -1.260 0.208459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.38 on 361 degrees of freedom
## Multiple R-squared: 0.1921, Adjusted R-squared: 0.1831
## F-statistic: 21.45 on 4 and 361 DF, p-value: 6.823e-16
vif(reg)
## avg_temp avg_dewtemp avg_sea avg_wind
## 2.595872 2.433845 1.585943 1.255538
reg.step <- step(reg, direction = "both")
## Start: AIC=1785.08
## avg_meter_reading ~ (timestamp + avg_temp + avg_dewtemp + avg_sea +
## avg_wind) - timestamp
##
## Df Sum of Sq RSS AIC
## - avg_dewtemp 1 76.9 46827 1783.7
## - avg_wind 1 205.6 46956 1784.7
## <none> 46750 1785.1
## - avg_sea 1 1785.9 48536 1796.8
## - avg_temp 1 6364.1 53114 1829.8
##
## Step: AIC=1783.68
## avg_meter_reading ~ avg_temp + avg_sea + avg_wind
##
## Df Sum of Sq RSS AIC
## - avg_wind 1 186.6 47014 1783.1
## <none> 46827 1783.7
## + avg_dewtemp 1 76.9 46750 1785.1
## - avg_sea 1 1903.7 48731 1796.3
## - avg_temp 1 10769.5 57597 1857.4
##
## Step: AIC=1783.13
## avg_meter_reading ~ avg_temp + avg_sea
##
## Df Sum of Sq RSS AIC
## <none> 47014 1783.1
## + avg_wind 1 186.6 46827 1783.7
## + avg_dewtemp 1 57.8 46956 1784.7
## - avg_sea 1 2897.3 49911 1803.0
## - avg_temp 1 10833.9 57848 1857.0
summary(reg.step)
##
## Call:
## lm(formula = avg_meter_reading ~ avg_temp + avg_sea, data = final_df_drop_cloud)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.598 -9.075 2.071 7.997 25.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -609.0156 162.0385 -3.758 0.000199 ***
## avg_temp 2.0987 0.2295 9.146 < 2e-16 ***
## avg_sea 0.7457 0.1577 4.730 3.22e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.38 on 363 degrees of freedom
## Multiple R-squared: 0.1875, Adjusted R-squared: 0.183
## F-statistic: 41.89 on 2 and 363 DF, p-value: < 2.2e-16
vif(reg.step)
## avg_temp avg_sea
## 1.304907 1.304907
reg.step.pred <- predict(reg.step, valid.t)
accuracy(reg.step.pred, valid.t$avg_meter_reading)
## ME RMSE MAE MPE MAPE
## Test set -0.5353107 12.16403 10.17624 -0.7690919 5.791347
par(mfrow=c(2,2))
plot(reg.step)
all.residuals <- valid.t$avg_meter_reading - reg.step.pred
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
data.frame("Predicted" = reg.step.pred, "Actual" = valid.t$avg_meter_reading,
"Residual" = all.residuals)[0:20,]
## Predicted Actual Residual
## 1 166.8878 152.7763 -14.111406
## 2 166.2675 162.7163 -3.551230
## 3 165.2673 180.8357 15.568466
## 4 164.6651 181.7982 17.133067
## 5 163.7626 178.3992 14.636588
## 6 164.6746 178.0040 13.329453
## 7 167.8237 180.0174 12.193712
## 8 176.0162 160.3165 -15.699700
## 9 179.0389 181.2249 2.185982
## 10 179.7029 181.7119 2.008991
## 11 180.1603 176.9364 -3.223907
## 12 180.0873 193.8463 13.759031
## 13 177.6272 162.4589 -15.168271
## 14 179.0858 192.3999 13.314098
## 15 170.8637 190.6150 19.751318
## 16 174.3828 191.6349 17.252130
## 17 188.3456 190.8557 2.510106
## 18 182.4843 208.4073 25.923056
## 19 177.8332 169.3656 -8.467530
## 20 179.2557 198.3520 19.096313
Try foward selection
### forward selection
reg.null <- lm(avg_meter_reading~1 -timestamp, data = train.t)
# use step() to run forward regression.
reg.step.forward <- step(reg.null, scope=list(lower=reg.null, upper=reg), direction = "forward")
## Start: AIC=1090.41
## avg_meter_reading ~ 1 - timestamp
##
## Df Sum of Sq RSS AIC
## + avg_temp 1 3442.2 27533 1066.5
## + avg_dewtemp 1 842.5 30133 1086.3
## <none> 30975 1090.4
## + avg_sea 1 10.9 30964 1092.3
## + avg_wind 1 0.1 30975 1092.4
##
## Step: AIC=1066.49
## avg_meter_reading ~ avg_temp
##
## Df Sum of Sq RSS AIC
## + avg_sea 1 2209.76 25323 1050.1
## + avg_dewtemp 1 324.17 27209 1065.9
## <none> 27533 1066.5
## + avg_wind 1 207.34 27326 1066.8
##
## Step: AIC=1050.09
## avg_meter_reading ~ avg_temp + avg_sea
##
## Df Sum of Sq RSS AIC
## + avg_dewtemp 1 258.77 25064 1049.8
## <none> 25323 1050.1
## + avg_wind 1 31.31 25292 1051.8
##
## Step: AIC=1049.83
## avg_meter_reading ~ avg_temp + avg_sea + avg_dewtemp
##
## Df Sum of Sq RSS AIC
## <none> 25064 1049.8
## + avg_wind 1 19.143 25045 1051.7
summary(reg.step.forward)
##
## Call:
## lm(formula = avg_meter_reading ~ avg_temp + avg_sea + avg_dewtemp,
## data = train.t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.496 -9.043 1.838 7.214 23.714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -746.1867 210.5186 -3.545 0.000482 ***
## avg_temp 2.6520 0.4148 6.394 9.86e-10 ***
## avg_sea 0.8775 0.2041 4.299 2.60e-05 ***
## avg_dewtemp -0.4953 0.3316 -1.493 0.136812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.77 on 216 degrees of freedom
## Multiple R-squared: 0.1908, Adjusted R-squared: 0.1796
## F-statistic: 16.98 on 3 and 216 DF, p-value: 6.137e-10
reg.step.forward.pred <- predict(reg.step.forward, valid.t)
accuracy(reg.step.forward.pred, valid.t$avg_meter_reading)
## ME RMSE MAE MPE MAPE
## Test set -0.9255571 12.34046 10.20431 -0.9818985 5.826987
vif(reg.step.forward)
## avg_temp avg_sea avg_dewtemp
## 2.496889 1.537515 2.023579
par(mfrow=c(2,2))
plot(reg.step.forward)
all.residuals <- valid.t$avg_meter_reading - reg.step.forward.pred
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
data.frame("Predicted" = reg.step.forward.pred, "Actual" = valid.t$avg_meter_reading,
"Residual" = all.residuals)[0:20,]
## Predicted Actual Residual
## 1 169.3621 152.7763 -16.5857690
## 2 167.4748 162.7163 -4.7585269
## 3 163.8091 180.8357 17.0266624
## 4 161.6684 181.7982 20.1297933
## 5 161.1773 178.3992 17.2218351
## 6 162.4909 178.0040 15.5131701
## 7 166.6085 180.0174 13.4088824
## 8 175.6293 160.3165 -15.3128147
## 9 179.5288 181.2249 1.6960456
## 10 180.2468 181.7119 1.4650774
## 11 178.8350 176.9364 -1.8986676
## 12 180.0963 193.8463 13.7499905
## 13 176.9519 162.4589 -14.4929523
## 14 179.1550 192.3999 13.2449231
## 15 173.9642 190.6150 16.6508580
## 16 175.5009 191.6349 16.1340036
## 17 190.0730 190.8557 0.7827144
## 18 183.2199 208.4073 25.1874878
## 19 178.9895 169.3656 -9.6238977
## 20 180.7743 198.3520 17.5777561
Try backward selection
### backward
reg.step.back <- step(reg, direction = "backward")
## Start: AIC=1785.08
## avg_meter_reading ~ (timestamp + avg_temp + avg_dewtemp + avg_sea +
## avg_wind) - timestamp
##
## Df Sum of Sq RSS AIC
## - avg_dewtemp 1 76.9 46827 1783.7
## - avg_wind 1 205.6 46956 1784.7
## <none> 46750 1785.1
## - avg_sea 1 1785.9 48536 1796.8
## - avg_temp 1 6364.1 53114 1829.8
##
## Step: AIC=1783.68
## avg_meter_reading ~ avg_temp + avg_sea + avg_wind
##
## Df Sum of Sq RSS AIC
## - avg_wind 1 186.6 47014 1783.1
## <none> 46827 1783.7
## - avg_sea 1 1903.7 48731 1796.3
## - avg_temp 1 10769.5 57597 1857.4
##
## Step: AIC=1783.13
## avg_meter_reading ~ avg_temp + avg_sea
##
## Df Sum of Sq RSS AIC
## <none> 47014 1783.1
## - avg_sea 1 2897.3 49911 1803.0
## - avg_temp 1 10833.9 57848 1857.0
summary(reg.step.back)
##
## Call:
## lm(formula = avg_meter_reading ~ avg_temp + avg_sea, data = final_df_drop_cloud)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.598 -9.075 2.071 7.997 25.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -609.0156 162.0385 -3.758 0.000199 ***
## avg_temp 2.0987 0.2295 9.146 < 2e-16 ***
## avg_sea 0.7457 0.1577 4.730 3.22e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.38 on 363 degrees of freedom
## Multiple R-squared: 0.1875, Adjusted R-squared: 0.183
## F-statistic: 41.89 on 2 and 363 DF, p-value: < 2.2e-16
reg.step.back.pred <- predict(reg.step.back, valid.t)
accuracy(reg.step.back.pred, valid.t$avg_meter_reading)
## ME RMSE MAE MPE MAPE
## Test set -0.5353107 12.16403 10.17624 -0.7690919 5.791347
vif(reg.step.back)
## avg_temp avg_sea
## 1.304907 1.304907
par(mfrow=c(2,2))
plot(reg.step.back)
all.residuals<- valid.t$avg_meter_reading - reg.step.back.pred
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
data.frame("Predicted" = reg.step.back.pred, "Actual" = valid.t$avg_meter_reading,
"Residual" = all.residuals)[0:20,]
## Predicted Actual Residual
## 1 166.8878 152.7763 -14.111406
## 2 166.2675 162.7163 -3.551230
## 3 165.2673 180.8357 15.568466
## 4 164.6651 181.7982 17.133067
## 5 163.7626 178.3992 14.636588
## 6 164.6746 178.0040 13.329453
## 7 167.8237 180.0174 12.193712
## 8 176.0162 160.3165 -15.699700
## 9 179.0389 181.2249 2.185982
## 10 179.7029 181.7119 2.008991
## 11 180.1603 176.9364 -3.223907
## 12 180.0873 193.8463 13.759031
## 13 177.6272 162.4589 -15.168271
## 14 179.0858 192.3999 13.314098
## 15 170.8637 190.6150 19.751318
## 16 174.3828 191.6349 17.252130
## 17 188.3456 190.8557 2.510106
## 18 182.4843 208.4073 25.923056
## 19 177.8332 169.3656 -8.467530
## 20 179.2557 198.3520 19.096313
Try exhaustive search
###exhautive
search.train <- regsubsets(avg_meter_reading ~. -timestamp , data = train.t, nbest = 1, nvmax = dim(train.t)[2],
method = "exhaustive")
sum <- summary(search.train)
search.valid <- regsubsets(avg_meter_reading ~. -timestamp , data = valid.t, nbest = 1, nvmax = dim(valid.t)[2],
method = "exhaustive")
sum <- summary(search.valid)
# show models
sum$which
## (Intercept) avg_temp avg_dewtemp avg_sea avg_wind
## 1 TRUE TRUE FALSE FALSE FALSE
## 2 TRUE TRUE FALSE FALSE TRUE
## 3 TRUE TRUE FALSE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE TRUE
# show metrics
sum$rsq
## [1] 0.1694772 0.2248555 0.2313469 0.2321820
sum$adjr2
## [1] 0.1637097 0.2140143 0.2151078 0.2103999
sum$Cp
## NULL
Attempt to add interection term and polynomial term
reg_removed_interaction <- lm(avg_meter_reading ~. -timestamp + avg_temp*avg_sea, data = final_df_drop_cloud[-c(366),] )
reg.step.removed.interaction<- step(reg_removed_interaction, direction = "both")
## Start: AIC=1769.07
## avg_meter_reading ~ (timestamp + avg_temp + avg_dewtemp + avg_sea +
## avg_wind) - timestamp + avg_temp * avg_sea
##
## Df Sum of Sq RSS AIC
## <none> 44971 1769.1
## - avg_dewtemp 1 265.46 45237 1769.2
## - avg_wind 1 362.55 45334 1770.0
## - avg_temp:avg_sea 1 1033.35 46005 1775.4
summary(reg.step.removed.interaction)
##
## Call:
## lm(formula = avg_meter_reading ~ (timestamp + avg_temp + avg_dewtemp +
## avg_sea + avg_wind) - timestamp + avg_temp * avg_sea, data = final_df_drop_cloud[-c(366),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.728 -9.804 2.157 7.975 29.958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1335.43938 651.64994 2.049 0.04116 *
## avg_temp -125.84991 44.59251 -2.822 0.00503 **
## avg_dewtemp -0.41178 0.28287 -1.456 0.14634
## avg_sea -1.16198 0.63961 -1.817 0.07010 .
## avg_wind -0.70634 0.41519 -1.701 0.08976 .
## avg_temp:avg_sea 0.12602 0.04388 2.872 0.00432 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.19 on 359 degrees of freedom
## Multiple R-squared: 0.202, Adjusted R-squared: 0.1909
## F-statistic: 18.17 on 5 and 359 DF, p-value: 4.418e-16
vif(reg.step.removed.interaction)
## avg_temp avg_dewtemp avg_sea avg_wind
## 50595.238168 2.538012 22.173636 1.280042
## avg_temp:avg_sea
## 49746.324890
reg.step.pred.removed.int<- predict(reg.step.removed.interaction, valid.t)
accuracy(reg.step.pred.removed.int, valid.t$avg_meter_reading)
## ME RMSE MAE MPE MAPE
## Test set -0.910656 11.92183 9.976769 -0.9710461 5.700151
par(mfrow=c(2,2))
plot(reg.step.removed.interaction)
all.residuals <- valid.t$avg_meter_reading - reg.step.pred.removed.int
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
data.frame("Predicted" = reg.step.pred.removed.int, "Actual" = valid.t$avg_meter_reading,
"Residual" = all.residuals)[0:20,]
## Predicted Actual Residual
## 1 169.2701 152.7763 -16.4937532
## 2 172.7049 162.7163 -9.9886535
## 3 172.5948 180.8357 8.2409661
## 4 169.0780 181.7982 12.7201964
## 5 168.9951 178.3992 9.4041068
## 6 170.6338 178.0040 7.3702737
## 7 169.9806 180.0174 10.0367889
## 8 174.9719 160.3165 -14.6554318
## 9 177.7839 181.2249 3.4409756
## 10 178.6682 181.7119 3.0437234
## 11 180.0261 176.9364 -3.0897771
## 12 180.4000 193.8463 13.4462530
## 13 176.4142 162.4589 -13.9552534
## 14 176.4721 192.3999 15.9277591
## 15 173.2380 190.6150 17.3770810
## 16 168.0975 191.6349 23.5374039
## 17 191.3305 190.8557 -0.4747984
## 18 185.0932 208.4073 23.3141402
## 19 177.4851 169.3656 -8.1194405
## 20 182.3301 198.3520 16.0219395
Elect to use Both method Removing outlier with outlier test
outlierTest(reg.step)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 366 -2.372933 0.018169 NA
reg_removed <- lm(avg_meter_reading ~. -timestamp, data = final_df_drop_cloud[-c(366),] )
reg.step.removed <- step(reg_removed, direction = "both")
## Start: AIC=1775.36
## avg_meter_reading ~ (timestamp + avg_temp + avg_dewtemp + avg_sea +
## avg_wind) - timestamp
##
## Df Sum of Sq RSS AIC
## - avg_dewtemp 1 92.8 46098 1774.1
## - avg_wind 1 217.7 46222 1775.1
## <none> 46005 1775.4
## - avg_sea 1 1570.4 47575 1785.6
## - avg_temp 1 6080.7 52085 1818.7
##
## Step: AIC=1774.1
## avg_meter_reading ~ avg_temp + avg_sea + avg_wind
##
## Df Sum of Sq RSS AIC
## - avg_wind 1 196.0 46294 1773.6
## <none> 46098 1774.1
## + avg_dewtemp 1 92.8 46005 1775.4
## - avg_sea 1 1690.8 47788 1785.2
## - avg_temp 1 9986.5 56084 1843.7
##
## Step: AIC=1773.64
## avg_meter_reading ~ avg_temp + avg_sea
##
## Df Sum of Sq RSS AIC
## <none> 46294 1773.6
## + avg_wind 1 196.0 46098 1774.1
## + avg_dewtemp 1 71.1 46222 1775.1
## - avg_sea 1 2622.3 48916 1791.8
## - avg_temp 1 10054.1 56348 1843.4
summary(reg.step.removed)
##
## Call:
## lm(formula = avg_meter_reading ~ avg_temp + avg_sea, data = final_df_drop_cloud[-c(366),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.981 -9.016 2.132 7.863 25.873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -574.0102 161.6891 -3.550 0.000436 ***
## avg_temp 2.0355 0.2296 8.867 < 2e-16 ***
## avg_sea 0.7123 0.1573 4.528 8.08e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.31 on 362 degrees of freedom
## Multiple R-squared: 0.1785, Adjusted R-squared: 0.174
## F-statistic: 39.34 on 2 and 362 DF, p-value: 3.481e-16
vif(reg.step.removed)
## avg_temp avg_sea
## 1.313558 1.313558
reg.step.pred.removed<- predict(reg.step.removed, valid.t)
accuracy(reg.step.pred.removed, valid.t$avg_meter_reading)
## ME RMSE MAE MPE MAPE
## Test set -0.6163067 12.16049 10.17427 -0.8182031 5.794396
par(mfrow=c(2,2))
plot(reg.step.removed)
all.residuals <- valid.t$avg_meter_reading - reg.step.pred.removed
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "")
data.frame("Predicted" = reg.step.pred.removed, "Actual" = valid.t$avg_meter_reading,
"Residual" = all.residuals)[0:20,]
## Predicted Actual Residual
## 1 167.3666 152.7763 -14.590208
## 2 166.8286 162.7163 -4.112265
## 3 165.9160 180.8357 14.919732
## 4 165.3691 181.7982 16.429108
## 5 164.4727 178.3992 13.926484
## 6 165.3185 178.0040 12.685502
## 7 168.2909 180.0174 11.726532
## 8 176.1941 160.3165 -15.877564
## 9 179.0925 181.2249 2.132355
## 10 179.7401 181.7119 1.971755
## 11 180.2374 176.9364 -3.301008
## 12 180.1400 193.8463 13.706269
## 13 177.8149 162.4589 -15.356023
## 14 179.1241 192.3999 13.275800
## 15 171.2463 190.6150 19.368703
## 16 174.5499 191.6349 17.085076
## 17 188.1516 190.8557 2.704019
## 18 182.5346 208.4073 25.872761
## 19 177.9367 169.3656 -8.571060
## 20 179.3718 198.3520 18.980230