# Clay Ford
# 1/2/19


# Budget/Expenditure Modeling

# Purpose

# develop a model that can better predict library budget needs. There are
# multiple dimensions of this: (1) to identify what external changes generate
# additional costs for the library, and generate an estimate of the effect, (2)
# build a model that accurately forecasts budget needs

# ARL Data-Variables

# use Total Expenditures (TOTEXP) and Total Expenditures for Materials (EXPLM)
# as the key outcomes. 


# Independent/predictor variables:

# University Data: Total full-time student enrollment (TOTSTU), Total graduate
# student enrollment (GRADSTU), Ph.D. s awarded (PHDAWD), Ph.D. fields (PHDFLD),
# Instructional faculty (FAC), University Type (TYPE), Law library included
# (LAW), Medical library included (MED); Not listed in codebook, but available
# in data file: Total part time students (TOTPT), Part-time graduate students
# (GRADPT)
#
# Supplemental Data: Masters degrees awarded (MSDA), government grants and
# contracts (GGACPE)
#
# Public Services Data: Total lending (ILLTOT), Total borrowing (ILBTOT),
# Initial circulations (INITCIRC), Reference transactions (REFTRANS)



# analysis ----------------------------------------------------------------

library(MASS)
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(bootStepAIC)

# load workspace
load("arl.Rdata")

# Choose a model by AIC in a Stepwise Algorithm

# Model totexp
mod_totexp <- lm(totexp/1e6 ~ ., data = arl_totexp)
step_totexp <- step(mod_totexp, trace = 0)
summary(step_totexp)
## 
## Call:
## lm(formula = totexp/1e+06 ~ gradstu + phdfld + type + illtot + 
##     initcirc, data = arl_totexp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.968  -5.055  -0.116   3.986  43.822 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.860e+00  3.095e+00   2.539  0.01279 *  
## gradstu      1.520e-03  4.680e-04   3.248  0.00162 ** 
## phdfld       9.951e-02  4.685e-02   2.124  0.03636 *  
## typeS       -1.073e+01  2.272e+00  -4.723 8.30e-06 ***
## illtot       1.236e-04  5.705e-05   2.166  0.03286 *  
## initcirc     6.247e-05  9.887e-06   6.319 9.27e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.23 on 92 degrees of freedom
## Multiple R-squared:  0.6945, Adjusted R-squared:  0.6779 
## F-statistic: 41.82 on 5 and 92 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(step_totexp, id.n = 5)

# summary(influence.measures(step_totexp))
# arl_totexp["SUNY-ALBANY",]

# Model explm
mod_explm <- lm(explm/1e6 ~ ., data = arl_explm)
step_explm <- step(mod_explm, trace = 0)
summary(step_explm)
## 
## Call:
## lm(formula = explm/1e+06 ~ phdfld + type + illtot + initcirc + 
##     msda, data = arl_explm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5203  -2.3470  -0.4247   1.8305  22.3411 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.979e+00  1.464e+00   4.083 9.48e-05 ***
## phdfld       7.024e-02  1.955e-02   3.593 0.000528 ***
## typeS       -5.535e+00  1.125e+00  -4.920 3.77e-06 ***
## illtot       5.641e-05  2.680e-05   2.105 0.038025 *  
## initcirc     2.223e-05  4.575e-06   4.860 4.81e-06 ***
## msda         8.213e-04  4.463e-04   1.840 0.068993 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.807 on 92 degrees of freedom
## Multiple R-squared:  0.629,  Adjusted R-squared:  0.6088 
## F-statistic: 31.19 on 5 and 92 DF,  p-value: < 2.2e-16
plot(step_explm, id.n = 5)

# summary(influence.measures(step_explm))
# arl_explm["SUNY-ALBANY",]

# robust regression
# iterated re-weighted least squares 
rlm_totexp <- rlm(formula(step_totexp), data = arl_totexp)
summary(rlm_totexp)
## 
## Call: rlm(formula = formula(step_totexp), data = arl_totexp)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -35.0649  -4.1739  -0.3742   4.3508  47.8578 
## 
## Coefficients:
##             Value   Std. Error t value
## (Intercept)  7.1339  2.0676     3.4504
## gradstu      0.0014  0.0003     4.4338
## phdfld       0.0995  0.0313     3.1785
## typeS       -7.2807  1.5177    -4.7972
## illtot       0.0001  0.0000     2.5437
## initcirc     0.0001  0.0000     9.0256
## 
## Residual standard error: 6.287 on 92 degrees of freedom
plot(rlm_totexp, id.n=5)

rlm_explm <- rlm(formula(step_explm), data = arl_explm)
summary(rlm_explm)
## 
## Call: rlm(formula = formula(step_explm), data = arl_explm)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -11.71362  -1.71092  -0.09153   1.53069  25.11442 
## 
## Coefficients:
##             Value   Std. Error t value
## (Intercept)  5.1660  1.0131     5.0994
## phdfld       0.0615  0.0135     4.5490
## typeS       -3.6573  0.7781    -4.7000
## illtot       0.0000  0.0000     2.4742
## initcirc     0.0000  0.0000     5.7141
## msda         0.0011  0.0003     3.6772
## 
## Residual standard error: 2.436 on 92 degrees of freedom
plot(rlm_explm, id.n = 5)

# compare models
AIC(step_totexp, rlm_totexp)
##             df      AIC
## step_totexp  7 741.7183
## rlm_totexp   7 745.6867
AIC(step_explm, rlm_explm)
##            df      AIC
## step_explm  7 593.6496
## rlm_explm   7 599.0097
# the lm model seems to do as well as the robust model

# make prediction for uva
predict(mod_totexp, newdata = uva_totexp)
## VIRGINIA 
## 28.70801
uva_totexp[["totexp"]]/1e6
## [1] 40.02785
predict(mod_explm, newdata = uva_explm)
## VIRGINIA 
## 12.31278
uva_explm[["explm"]]/1e6
## [1] 14.03352
# some basic effect plots
plot(allEffects(step_totexp)) 

plot(allEffects(step_explm))

# Implement a Bootstrap procedure to investigate the variability of model
# selection under the step() stepwise algorithm 

boot_totexp <- boot.stepAIC(mod_totexp, data = arl_totexp, B = 999)
boot_explm <- boot.stepAIC(mod_explm, data = arl_explm, B = 999)

# percentage of times each variable was selected
boot_totexp$Covariates
##               (%)
## initcirc 99.79980
## type     80.68068
## phdfld   78.67868
## ggacpe   61.66166
## fac      55.55556
## msda     54.75475
## gradstu  50.65065
## illtot   43.24324
## gradpt   41.04104
## totstu   40.84084
## phdawd   39.03904
## med      37.83784
## ilbtot   35.83584
## totpt    34.73473
## law      33.93393
## reftrans 17.11712
# percentage of times regression coefficient of each variable had sign + and -
boot_totexp$Sign
##                + (%)      - (%)
## initcirc 100.0000000  0.0000000
## phdfld   100.0000000  0.0000000
## ggacpe    99.6753247  0.3246753
## fac       98.3783784  1.6216216
## law1      93.5103245  6.4896755
## ilbtot    91.3407821  8.6592179
## msda      87.3857404 12.6142596
## illtot    84.9537037 15.0462963
## gradstu   83.9920949 16.0079051
## phdawd    82.3076923 17.6923077
## reftrans  55.5555556 44.4444444
## totstu    34.8039216 65.1960784
## gradpt    34.3902439 65.6097561
## totpt     11.2391931 88.7608069
## med1       1.3227513 98.6772487
## typeS      0.1240695 99.8759305
#  percentage of times the regression coefficient of each variable was
#  significant under the alpha significance level
boot_totexp$Significance
##               (%)
## initcirc 99.49850
## gradstu  84.58498
## typeS    83.74690
## phdfld   78.37150
## msda     78.06216
## ggacpe   72.88961
## totpt    72.33429
## fac      71.89189
## gradpt   68.53659
## totstu   66.66667
## illtot   65.97222
## ilbtot   62.56983
## phdawd   62.30769
## med1     50.26455
## law1     43.36283
## reftrans 31.57895
# Initial circulations (INITCIRC), University Type (TYPE), Total graduate
# student enrollment (GRADSTU), and  Ph.D. fields (PHDFLD) all appear to be
# consistently explaining variability in Total Expenditures (TOTEXP)


# percentage of times each variable was selected
boot_explm$Covariates
##               (%)
## initcirc 98.79880
## phdfld   91.59159
## type     87.18719
## msda     55.95596
## illtot   55.65566
## med      55.45546
## ggacpe   52.55255
## totstu   51.15115
## phdawd   49.14915
## law      46.14615
## fac      44.74474
## gradpt   39.13914
## gradstu  38.73874
## ilbtot   27.02703
## totpt    26.52653
## reftrans 21.22122
# percentage of times regression coefficient of each variable had sign + and -
boot_explm$Sign
##                + (%)      - (%)
## initcirc 100.0000000  0.0000000
## phdfld   100.0000000  0.0000000
## ggacpe    99.6190476  0.3809524
## fac       98.8814318  1.1185682
## illtot    98.7410072  1.2589928
## law1      98.2646421  1.7353579
## msda      92.1288014  7.8711986
## phdawd    89.2057026 10.7942974
## reftrans  74.0566038 25.9433962
## gradstu   54.5219638 45.4780362
## totpt     42.2641509 57.7358491
## ilbtot    37.0370370 62.9629630
## gradpt    35.5498721 64.4501279
## totstu     7.4363992 92.5636008
## med1       0.5415162 99.4584838
## typeS      0.1148106 99.8851894
#  percentage of times the regression coefficient of each variable was
#  significant under the alpha significance level
boot_explm$Significance
##               (%)
## initcirc 96.55522
## phdfld   89.94536
## typeS    86.22273
## msda     74.59750
## illtot   74.10072
## totstu   71.42857
## phdawd   67.61711
## gradpt   65.47315
## ggacpe   65.33333
## gradstu  64.59948
## med1     59.02527
## law1     58.56833
## fac      55.03356
## totpt    53.58491
## ilbtot   45.92593
## reftrans 37.26415
# Initial circulations (INITCIRC), University Type (TYPE), and  Ph.D. fields
# (PHDFLD) all appear to be consistently explaining variability in Total
# Expenditures for Materials (EXPLM)