# Clay Ford
# 1/2/19
# Budget/Expenditure Modeling
# Purpose
# develop a model that can better predict library budget needs. There are
# multiple dimensions of this: (1) to identify what external changes generate
# additional costs for the library, and generate an estimate of the effect, (2)
# build a model that accurately forecasts budget needs
# ARL Data-Variables
# use Total Expenditures (TOTEXP) and Total Expenditures for Materials (EXPLM)
# as the key outcomes.
# Independent/predictor variables:
# University Data: Total full-time student enrollment (TOTSTU), Total graduate
# student enrollment (GRADSTU), Ph.D. s awarded (PHDAWD), Ph.D. fields (PHDFLD),
# Instructional faculty (FAC), University Type (TYPE), Law library included
# (LAW), Medical library included (MED); Not listed in codebook, but available
# in data file: Total part time students (TOTPT), Part-time graduate students
# (GRADPT)
#
# Supplemental Data: Masters degrees awarded (MSDA), government grants and
# contracts (GGACPE)
#
# Public Services Data: Total lending (ILLTOT), Total borrowing (ILBTOT),
# Initial circulations (INITCIRC), Reference transactions (REFTRANS)
# analysis ----------------------------------------------------------------
library(MASS)
library(effects)
## Loading required package: carData
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
library(bootStepAIC)
# load workspace
load("arl.Rdata")
# Choose a model by AIC in a Stepwise Algorithm
# Model totexp
mod_totexp <- lm(totexp/1e6 ~ ., data = arl_totexp)
step_totexp <- step(mod_totexp, trace = 0)
summary(step_totexp)
##
## Call:
## lm(formula = totexp/1e+06 ~ gradstu + phdfld + type + illtot +
## initcirc, data = arl_totexp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.968 -5.055 -0.116 3.986 43.822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.860e+00 3.095e+00 2.539 0.01279 *
## gradstu 1.520e-03 4.680e-04 3.248 0.00162 **
## phdfld 9.951e-02 4.685e-02 2.124 0.03636 *
## typeS -1.073e+01 2.272e+00 -4.723 8.30e-06 ***
## illtot 1.236e-04 5.705e-05 2.166 0.03286 *
## initcirc 6.247e-05 9.887e-06 6.319 9.27e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.23 on 92 degrees of freedom
## Multiple R-squared: 0.6945, Adjusted R-squared: 0.6779
## F-statistic: 41.82 on 5 and 92 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(step_totexp, id.n = 5)
# summary(influence.measures(step_totexp))
# arl_totexp["SUNY-ALBANY",]
# Model explm
mod_explm <- lm(explm/1e6 ~ ., data = arl_explm)
step_explm <- step(mod_explm, trace = 0)
summary(step_explm)
##
## Call:
## lm(formula = explm/1e+06 ~ phdfld + type + illtot + initcirc +
## msda, data = arl_explm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5203 -2.3470 -0.4247 1.8305 22.3411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.979e+00 1.464e+00 4.083 9.48e-05 ***
## phdfld 7.024e-02 1.955e-02 3.593 0.000528 ***
## typeS -5.535e+00 1.125e+00 -4.920 3.77e-06 ***
## illtot 5.641e-05 2.680e-05 2.105 0.038025 *
## initcirc 2.223e-05 4.575e-06 4.860 4.81e-06 ***
## msda 8.213e-04 4.463e-04 1.840 0.068993 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.807 on 92 degrees of freedom
## Multiple R-squared: 0.629, Adjusted R-squared: 0.6088
## F-statistic: 31.19 on 5 and 92 DF, p-value: < 2.2e-16
plot(step_explm, id.n = 5)
# summary(influence.measures(step_explm))
# arl_explm["SUNY-ALBANY",]
# robust regression
# iterated re-weighted least squares
rlm_totexp <- rlm(formula(step_totexp), data = arl_totexp)
summary(rlm_totexp)
##
## Call: rlm(formula = formula(step_totexp), data = arl_totexp)
## Residuals:
## Min 1Q Median 3Q Max
## -35.0649 -4.1739 -0.3742 4.3508 47.8578
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 7.1339 2.0676 3.4504
## gradstu 0.0014 0.0003 4.4338
## phdfld 0.0995 0.0313 3.1785
## typeS -7.2807 1.5177 -4.7972
## illtot 0.0001 0.0000 2.5437
## initcirc 0.0001 0.0000 9.0256
##
## Residual standard error: 6.287 on 92 degrees of freedom
plot(rlm_totexp, id.n=5)
rlm_explm <- rlm(formula(step_explm), data = arl_explm)
summary(rlm_explm)
##
## Call: rlm(formula = formula(step_explm), data = arl_explm)
## Residuals:
## Min 1Q Median 3Q Max
## -11.71362 -1.71092 -0.09153 1.53069 25.11442
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 5.1660 1.0131 5.0994
## phdfld 0.0615 0.0135 4.5490
## typeS -3.6573 0.7781 -4.7000
## illtot 0.0000 0.0000 2.4742
## initcirc 0.0000 0.0000 5.7141
## msda 0.0011 0.0003 3.6772
##
## Residual standard error: 2.436 on 92 degrees of freedom
plot(rlm_explm, id.n = 5)
# compare models
AIC(step_totexp, rlm_totexp)
## df AIC
## step_totexp 7 741.7183
## rlm_totexp 7 745.6867
AIC(step_explm, rlm_explm)
## df AIC
## step_explm 7 593.6496
## rlm_explm 7 599.0097
# the lm model seems to do as well as the robust model
# make prediction for uva
predict(mod_totexp, newdata = uva_totexp)
## VIRGINIA
## 28.70801
uva_totexp[["totexp"]]/1e6
## [1] 40.02785
predict(mod_explm, newdata = uva_explm)
## VIRGINIA
## 12.31278
uva_explm[["explm"]]/1e6
## [1] 14.03352
# some basic effect plots
plot(allEffects(step_totexp))
plot(allEffects(step_explm))
# Implement a Bootstrap procedure to investigate the variability of model
# selection under the step() stepwise algorithm
boot_totexp <- boot.stepAIC(mod_totexp, data = arl_totexp, B = 999)
boot_explm <- boot.stepAIC(mod_explm, data = arl_explm, B = 999)
# percentage of times each variable was selected
boot_totexp$Covariates
## (%)
## initcirc 99.79980
## type 80.68068
## phdfld 78.67868
## ggacpe 61.66166
## fac 55.55556
## msda 54.75475
## gradstu 50.65065
## illtot 43.24324
## gradpt 41.04104
## totstu 40.84084
## phdawd 39.03904
## med 37.83784
## ilbtot 35.83584
## totpt 34.73473
## law 33.93393
## reftrans 17.11712
# percentage of times regression coefficient of each variable had sign + and -
boot_totexp$Sign
## + (%) - (%)
## initcirc 100.0000000 0.0000000
## phdfld 100.0000000 0.0000000
## ggacpe 99.6753247 0.3246753
## fac 98.3783784 1.6216216
## law1 93.5103245 6.4896755
## ilbtot 91.3407821 8.6592179
## msda 87.3857404 12.6142596
## illtot 84.9537037 15.0462963
## gradstu 83.9920949 16.0079051
## phdawd 82.3076923 17.6923077
## reftrans 55.5555556 44.4444444
## totstu 34.8039216 65.1960784
## gradpt 34.3902439 65.6097561
## totpt 11.2391931 88.7608069
## med1 1.3227513 98.6772487
## typeS 0.1240695 99.8759305
# percentage of times the regression coefficient of each variable was
# significant under the alpha significance level
boot_totexp$Significance
## (%)
## initcirc 99.49850
## gradstu 84.58498
## typeS 83.74690
## phdfld 78.37150
## msda 78.06216
## ggacpe 72.88961
## totpt 72.33429
## fac 71.89189
## gradpt 68.53659
## totstu 66.66667
## illtot 65.97222
## ilbtot 62.56983
## phdawd 62.30769
## med1 50.26455
## law1 43.36283
## reftrans 31.57895
# Initial circulations (INITCIRC), University Type (TYPE), Total graduate
# student enrollment (GRADSTU), and Ph.D. fields (PHDFLD) all appear to be
# consistently explaining variability in Total Expenditures (TOTEXP)
# percentage of times each variable was selected
boot_explm$Covariates
## (%)
## initcirc 98.79880
## phdfld 91.59159
## type 87.18719
## msda 55.95596
## illtot 55.65566
## med 55.45546
## ggacpe 52.55255
## totstu 51.15115
## phdawd 49.14915
## law 46.14615
## fac 44.74474
## gradpt 39.13914
## gradstu 38.73874
## ilbtot 27.02703
## totpt 26.52653
## reftrans 21.22122
# percentage of times regression coefficient of each variable had sign + and -
boot_explm$Sign
## + (%) - (%)
## initcirc 100.0000000 0.0000000
## phdfld 100.0000000 0.0000000
## ggacpe 99.6190476 0.3809524
## fac 98.8814318 1.1185682
## illtot 98.7410072 1.2589928
## law1 98.2646421 1.7353579
## msda 92.1288014 7.8711986
## phdawd 89.2057026 10.7942974
## reftrans 74.0566038 25.9433962
## gradstu 54.5219638 45.4780362
## totpt 42.2641509 57.7358491
## ilbtot 37.0370370 62.9629630
## gradpt 35.5498721 64.4501279
## totstu 7.4363992 92.5636008
## med1 0.5415162 99.4584838
## typeS 0.1148106 99.8851894
# percentage of times the regression coefficient of each variable was
# significant under the alpha significance level
boot_explm$Significance
## (%)
## initcirc 96.55522
## phdfld 89.94536
## typeS 86.22273
## msda 74.59750
## illtot 74.10072
## totstu 71.42857
## phdawd 67.61711
## gradpt 65.47315
## ggacpe 65.33333
## gradstu 64.59948
## med1 59.02527
## law1 58.56833
## fac 55.03356
## totpt 53.58491
## ilbtot 45.92593
## reftrans 37.26415
# Initial circulations (INITCIRC), University Type (TYPE), and Ph.D. fields
# (PHDFLD) all appear to be consistently explaining variability in Total
# Expenditures for Materials (EXPLM)