Regresión Lineal Múltiple
# Load the semicolon-delimited absenteeism file and print the covariance
# matrix computed over every column of the resulting table.
library(readr)
data <- read_delim(
  "data/ausentismo.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
cov(data)
id ausen taller sexo edad
id 196.0000000 -17.5957447 1.08510638 0.94680851 39.2021277
ausen -17.5957447 14.3404255 0.42553191 -0.20212766 -34.8404255
taller 1.0851064 0.4255319 0.19148936 0.01595745 -0.8882979
sexo 0.9468085 -0.2021277 0.01595745 0.25132979 0.2380319
edad 39.2021277 -34.8404255 -0.88829787 0.23803191 185.6130319
antg 18.2127660 -27.7446809 -0.50000000 1.32712766 121.6515957
sala 3327.0212766 -1223.1914894 -22.34042553 118.24468085 2081.7553191
antg sala
id 18.212766 3327.02128
ausen -27.744681 -1223.19149
taller -0.500000 -22.34043
sexo 1.327128 118.24468
edad 121.651596 2081.75532
antg 104.920213 2175.42553
sala 2175.425532 234470.21277
# Correlation matrix over every column of `data` (cor() defaults to Pearson).
cor(data)
id ausen taller sexo edad antg
id 1.0000000 -0.3318934 0.1771218 0.1349001 0.2055310 0.1270043
ausen -0.3318934 1.0000000 0.2567904 -0.1064689 -0.6753023 -0.7152686
taller 0.1771218 0.2567904 1.0000000 0.0727393 -0.1489986 -0.1115496
sexo 0.1349001 -0.1064689 0.0727393 1.0000000 0.0348505 0.2584408
edad 0.2055310 -0.6753023 -0.1489986 0.0348505 1.0000000 0.8717340
antg 0.1270043 -0.7152686 -0.1115496 0.2584408 0.8717340 1.0000000
sala 0.4907764 -0.6670672 -0.1054327 0.4870974 0.3155598 0.4386022
sala
id 0.4907764
ausen -0.6670672
taller -0.1054327
sexo 0.4870974
edad 0.3155598
antg 0.4386022
sala 1.0000000
# Model estimation: regress `ausen` on the five candidate predictors.
# NOTE(review): attach() looks redundant because lm() receives `data`
# explicitly -- confirm nothing else relies on the attached columns.
attach(data)
modelo1 <- lm(ausen ~ taller + sexo + edad + antg + sala, data = data)
summary(modelo1)
Call:
lm(formula = ausen ~ taller + sexo + edad + antg + sala, data = data)
Residuals:
Min 1Q Median 3Q Max
-7.0713 -0.5383 0.3031 0.9391 3.5793
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.4436075 1.6404323 7.586 2.14e-09 ***
taller 0.9684600 0.6688242 1.448 0.15504
sexo 2.0492914 0.7122235 2.877 0.00628 **
edad -0.0372111 0.0469913 -0.792 0.43288
antg -0.1507700 0.0652833 -2.309 0.02590 *
sala -0.0044288 0.0007348 -6.027 3.63e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.964 on 42 degrees of freedom
Multiple R-squared: 0.7597, Adjusted R-squared: 0.7311
F-statistic: 26.56 on 5 and 42 DF, p-value: 5.282e-12
# Diagnostics for modelo1
coefficients(modelo1) # estimated coefficients
(Intercept) taller sexo edad antg sala
12.443607478 0.968459990 2.049291411 -0.037211075 -0.150770045 -0.004428793
yhat <- fitted(modelo1) # fitted values
u <- residuals(modelo1) # residuals
anova(modelo1) # ANOVA table
Analysis of Variance Table
Response: ausen
Df Sum Sq Mean Sq F value Pr(>F)
taller 1 44.444 44.444 11.5262 0.001510 **
sexo 1 10.612 10.612 2.7522 0.104573
edad 1 275.299 275.299 71.3956 1.331e-10 ***
antg 1 41.613 41.613 10.7919 0.002062 **
sala 1 140.080 140.080 36.3283 3.629e-07 ***
Residuals 42 161.950 3.856
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vcov(modelo1) # variance-covariance matrix of the estimated parameters
(Intercept) taller sexo edad
(Intercept) 2.6910180499 -0.4609636984 -0.1696280253 -5.605492e-02
taller -0.4609636984 0.4473257798 -0.0535216133 2.117073e-03
sexo -0.1696280253 -0.0535216133 0.5072623436 1.228218e-02
edad -0.0560549171 0.0021170730 0.0122821769 2.208178e-03
antg 0.0686105842 -0.0008662038 -0.0163953239 -2.692865e-03
sala -0.0006247407 0.0000588528 -0.0002178462 -6.131445e-07
antg sala
(Intercept) 6.861058e-02 -6.247407e-04
taller -8.662038e-04 5.885280e-05
sexo -1.639532e-02 -2.178462e-04
edad -2.692865e-03 -6.131445e-07
antg 4.261912e-03 -7.447761e-06
sala -7.447761e-06 5.399159e-07
# Stepwise regression: start from the full model and let stepAIC()
# search in both directions (terms may be dropped or re-added).
# NOTE(review): the result is stored as `step`, the same name as
# stats::step(); kept because later code reads `step$anova`.
library(MASS)
modelo2 <- lm(ausen ~ taller + sexo + edad + antg + sala, data = data)
step <- stepAIC(modelo2, direction = "both")
Start: AIC=70.37
ausen ~ taller + sexo + edad + antg + sala
Df Sum of Sq RSS AIC
- edad 1 2.418 164.37 69.084
<none> 161.95 70.372
- taller 1 8.085 170.03 70.711
- antg 1 20.566 182.52 74.111
- sexo 1 31.923 193.87 77.008
- sala 1 140.080 302.03 98.288
Step: AIC=69.08
ausen ~ taller + sexo + antg + sala
Df Sum of Sq RSS AIC
<none> 164.37 69.084
- taller 1 8.731 173.10 69.568
+ edad 1 2.418 161.95 70.372
- sexo 1 44.720 209.09 78.635
- sala 1 140.779 305.15 96.781
- antg 1 151.697 316.07 98.468
step$anova # display the stepwise model path
Stepwise Model Path
Analysis of Deviance Table
Initial Model:
ausen ~ taller + sexo + edad + antg + sala
Final Model:
ausen ~ taller + sexo + antg + sala
Step Df Deviance Resid. Df Resid. Dev AIC
1 42 161.9505 70.37230
2 - edad 1 2.417928 43 164.3684 69.08364
# Refit the model selected by the stepwise search (`edad` removed).
modelo3 <- lm(ausen ~ taller + sexo + antg + sala, data = data)
summary(modelo3)
Call:
lm(formula = ausen ~ taller + sexo + antg + sala, data = data)
Residuals:
Min 1Q Median 3Q Max
-6.989 -0.597 0.310 1.041 3.826
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.4989988 1.1211859 10.256 3.98e-13 ***
taller 1.0041358 0.6644050 1.511 0.13802
sexo 2.2562643 0.6596516 3.420 0.00138 **
antg -0.1961488 0.0311366 -6.300 1.34e-07 ***
sala -0.0044391 0.0007315 -6.069 2.91e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.955 on 43 degrees of freedom
Multiple R-squared: 0.7561, Adjusted R-squared: 0.7334
F-statistic: 33.33 on 4 and 43 DF, p-value: 1.15e-12
# Reduced model: same as modelo3 but without `taller`.
modelo4 <- lm(ausen ~ sexo + antg + sala, data = data)
summary(modelo4)
Call:
lm(formula = ausen ~ sexo + antg + sala, data = data)
Residuals:
Min 1Q Median 3Q Max
-6.8757 -0.9888 0.2701 1.3332 4.0126
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.4172771 0.9559277 12.990 < 2e-16 ***
sexo 2.4035082 0.6618691 3.631 0.000732 ***
antg -0.2000174 0.0314808 -6.354 1.02e-07 ***
sala -0.0045732 0.0007366 -6.208 1.67e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.983 on 44 degrees of freedom
Multiple R-squared: 0.7432, Adjusted R-squared: 0.7257
F-statistic: 42.44 on 3 and 44 DF, p-value: 4.805e-13
# Residuals of modelo4, reused by the normality check below.
uhat <- modelo4$residuals
#-----------------------------------------------------------------
# Normality of the errors: Shapiro-Wilk test on the residuals
shapiro.test(uhat)
Shapiro-Wilk normality test
data: uhat
W = 0.92696, p-value = 0.005279
# No-autocorrelation assumption
# install.packages("lmtest")
library(lmtest)
# Durbin-Watson test for autocorrelation
# H0: the errors are not autocorrelated
dwtest(modelo4)
Durbin-Watson test
data: modelo4
DW = 1.8731, p-value = 0.3097
alternative hypothesis: true autocorrelation is greater than 0
# Homoscedasticity assumption
# Goldfeld-Quandt test
# H0: there is no heteroscedasticity
gqtest(modelo4)
Goldfeld-Quandt test
data: modelo4
GQ = 0.46949, df1 = 20, df2 = 20, p-value = 0.9506
alternative hypothesis: variance increases from segment 1 to 2
# Correct-specification assumption
# Specification check: RESET test with second powers of the regressors
resettest(modelo4, power = 2, type = "regressor")
RESET test
data: modelo4
RESET = 1.7756, df1 = 3, df2 = 41, p-value = 0.1669
# Smaller model keeping only `antg` and `sala` as predictors.
modelo5 <- lm(ausen ~ antg + sala, data = data)
summary(modelo5)
Call:
lm(formula = ausen ~ antg + sala, data = data)
Residuals:
Min 1Q Median 3Q Max
-5.508 -1.103 0.033 1.914 3.610
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.9557841 1.0680594 11.194 1.35e-14 ***
antg -0.1934922 0.0354307 -5.461 1.95e-06 ***
sala -0.0034216 0.0007495 -4.565 3.85e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.236 on 45 degrees of freedom
Multiple R-squared: 0.6662, Adjusted R-squared: 0.6514
F-statistic: 44.91 on 2 and 45 DF, p-value: 1.898e-11