Regresión Lineal Múltiple

library(readr)
# Load data/ausentismo.csv, a semicolon-delimited text file.
data <- read_delim(
  file = "data/ausentismo.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Covariance matrix of every column in the data set
cov(data)
                 id         ausen       taller         sexo         edad
id      196.0000000   -17.5957447   1.08510638   0.94680851   39.2021277
ausen   -17.5957447    14.3404255   0.42553191  -0.20212766  -34.8404255
taller    1.0851064     0.4255319   0.19148936   0.01595745   -0.8882979
sexo      0.9468085    -0.2021277   0.01595745   0.25132979    0.2380319
edad     39.2021277   -34.8404255  -0.88829787   0.23803191  185.6130319
antg     18.2127660   -27.7446809  -0.50000000   1.32712766  121.6515957
sala   3327.0212766 -1223.1914894 -22.34042553 118.24468085 2081.7553191
              antg         sala
id       18.212766   3327.02128
ausen   -27.744681  -1223.19149
taller   -0.500000    -22.34043
sexo      1.327128    118.24468
edad    121.651596   2081.75532
antg    104.920213   2175.42553
sala   2175.425532 234470.21277
# Correlation matrix of every column in the data set
cor(data)
               id      ausen     taller       sexo       edad       antg
id      1.0000000 -0.3318934  0.1771218  0.1349001  0.2055310  0.1270043
ausen  -0.3318934  1.0000000  0.2567904 -0.1064689 -0.6753023 -0.7152686
taller  0.1771218  0.2567904  1.0000000  0.0727393 -0.1489986 -0.1115496
sexo    0.1349001 -0.1064689  0.0727393  1.0000000  0.0348505  0.2584408
edad    0.2055310 -0.6753023 -0.1489986  0.0348505  1.0000000  0.8717340
antg    0.1270043 -0.7152686 -0.1115496  0.2584408  0.8717340  1.0000000
sala    0.4907764 -0.6670672 -0.1054327  0.4870974  0.3155598  0.4386022
             sala
id      0.4907764
ausen  -0.6670672
taller -0.1054327
sexo    0.4870974
edad    0.3155598
antg    0.4386022
sala    1.0000000
# Model estimation
# Full model: ausen regressed on all five candidate predictors.
# The data frame is supplied through `data =`, so attach() is not needed; it
# was dropped to keep the column names off the search path, where they could
# mask (or be masked by) other objects with the same names.
modelo1 <- lm(ausen ~ taller + sexo + edad + antg + sala, data = data)
summary(modelo1)

Call:
lm(formula = ausen ~ taller + sexo + edad + antg + sala, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.0713 -0.5383  0.3031  0.9391  3.5793 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 12.4436075  1.6404323   7.586 2.14e-09 ***
taller       0.9684600  0.6688242   1.448  0.15504    
sexo         2.0492914  0.7122235   2.877  0.00628 ** 
edad        -0.0372111  0.0469913  -0.792  0.43288    
antg        -0.1507700  0.0652833  -2.309  0.02590 *  
sala        -0.0044288  0.0007348  -6.027 3.63e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.964 on 42 degrees of freedom
Multiple R-squared:  0.7597,    Adjusted R-squared:  0.7311 
F-statistic: 26.56 on 5 and 42 DF,  p-value: 5.282e-12
# Diagnostics
coef(modelo1) # estimated coefficients
 (Intercept)       taller         sexo         edad         antg         sala 
12.443607478  0.968459990  2.049291411 -0.037211075 -0.150770045 -0.004428793 
yhat <- fitted.values(modelo1) # fitted values
u <- resid(modelo1) # residuals
anova(modelo1) # ANOVA table
Analysis of Variance Table

Response: ausen
          Df  Sum Sq Mean Sq F value    Pr(>F)    
taller     1  44.444  44.444 11.5262  0.001510 ** 
sexo       1  10.612  10.612  2.7522  0.104573    
edad       1 275.299 275.299 71.3956 1.331e-10 ***
antg       1  41.613  41.613 10.7919  0.002062 ** 
sala       1 140.080 140.080 36.3283 3.629e-07 ***
Residuals 42 161.950   3.856                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vcov(modelo1) # variance-covariance matrix of the parameter estimates
              (Intercept)        taller          sexo          edad
(Intercept)  2.6910180499 -0.4609636984 -0.1696280253 -5.605492e-02
taller      -0.4609636984  0.4473257798 -0.0535216133  2.117073e-03
sexo        -0.1696280253 -0.0535216133  0.5072623436  1.228218e-02
edad        -0.0560549171  0.0021170730  0.0122821769  2.208178e-03
antg         0.0686105842 -0.0008662038 -0.0163953239 -2.692865e-03
sala        -0.0006247407  0.0000588528 -0.0002178462 -6.131445e-07
                     antg          sala
(Intercept)  6.861058e-02 -6.247407e-04
taller      -8.662038e-04  5.885280e-05
sexo        -1.639532e-02 -2.178462e-04
edad        -2.692865e-03 -6.131445e-07
antg         4.261912e-03 -7.447761e-06
sala        -7.447761e-06  5.399159e-07
# Stepwise regression
library(MASS)
# Start from the full model and let stepAIC search in both directions.
modelo2 <- lm(formula = ausen ~ taller + sexo + edad + antg + sala, data = data)
step <- stepAIC(modelo2, direction = "both")
Start:  AIC=70.37
ausen ~ taller + sexo + edad + antg + sala

         Df Sum of Sq    RSS    AIC
- edad    1     2.418 164.37 69.084
<none>                161.95 70.372
- taller  1     8.085 170.03 70.711
- antg    1    20.566 182.52 74.111
- sexo    1    31.923 193.87 77.008
- sala    1   140.080 302.03 98.288

Step:  AIC=69.08
ausen ~ taller + sexo + antg + sala

         Df Sum of Sq    RSS    AIC
<none>                164.37 69.084
- taller  1     8.731 173.10 69.568
+ edad    1     2.418 161.95 70.372
- sexo    1    44.720 209.09 78.635
- sala    1   140.779 305.15 96.781
- antg    1   151.697 316.07 98.468
step[["anova"]] # show the stepwise model path
Stepwise Model Path 
Analysis of Deviance Table

Initial Model:
ausen ~ taller + sexo + edad + antg + sala

Final Model:
ausen ~ taller + sexo + antg + sala

    Step Df Deviance Resid. Df Resid. Dev      AIC
1                           42   161.9505 70.37230
2 - edad  1 2.417928        43   164.3684 69.08364
# Refit the model chosen by the stepwise search (edad dropped).
modelo3 <- lm(formula = ausen ~ taller + sexo + antg + sala, data = data)
summary(modelo3)

Call:
lm(formula = ausen ~ taller + sexo + antg + sala, data = data)

Residuals:
   Min     1Q Median     3Q    Max 
-6.989 -0.597  0.310  1.041  3.826 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.4989988  1.1211859  10.256 3.98e-13 ***
taller       1.0041358  0.6644050   1.511  0.13802    
sexo         2.2562643  0.6596516   3.420  0.00138 ** 
antg        -0.1961488  0.0311366  -6.300 1.34e-07 ***
sala        -0.0044391  0.0007315  -6.069 2.91e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.955 on 43 degrees of freedom
Multiple R-squared:  0.7561,    Adjusted R-squared:  0.7334 
F-statistic: 33.33 on 4 and 43 DF,  p-value: 1.15e-12
# Reduced model without taller.
modelo4 <- lm(formula = ausen ~ sexo + antg + sala, data = data)
summary(modelo4)

Call:
lm(formula = ausen ~ sexo + antg + sala, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.8757 -0.9888  0.2701  1.3332  4.0126 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 12.4172771  0.9559277  12.990  < 2e-16 ***
sexo         2.4035082  0.6618691   3.631 0.000732 ***
antg        -0.2000174  0.0314808  -6.354 1.02e-07 ***
sala        -0.0045732  0.0007366  -6.208 1.67e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.983 on 44 degrees of freedom
Multiple R-squared:  0.7432,    Adjusted R-squared:  0.7257 
F-statistic: 42.44 on 3 and 44 DF,  p-value: 4.805e-13
uhat <- modelo4[["residuals"]]
#-----------------------------------------------------------------
# Normality check on the residuals (Shapiro-Wilk test)
shapiro.test(uhat)

    Shapiro-Wilk normality test

data:  uhat
W = 0.92696, p-value = 0.005279
# No-autocorrelation assumption
# install.packages("lmtest")
library(lmtest)
# Durbin-Watson test for autocorrelation
# H0: the errors are not autocorrelated
dwtest(modelo4)

    Durbin-Watson test

data:  modelo4
DW = 1.8731, p-value = 0.3097
alternative hypothesis: true autocorrelation is greater than 0
# Homoscedasticity assumption
# Goldfeld-Quandt test
# H0: there is no heteroscedasticity
gqtest(modelo4)

    Goldfeld-Quandt test

data:  modelo4
GQ = 0.46949, df1 = 20, df2 = 20, p-value = 0.9506
alternative hypothesis: variance increases from segment 1 to 2
# Correct-specification assumption
# Specification test
# RESET test
resettest(modelo4, power=2, type="regressor")

    RESET test

data:  modelo4
RESET = 1.7756, df1 = 3, df2 = 41, p-value = 0.1669
# Smaller model that keeps only antg and sala as predictors.
modelo5 <- lm(formula = ausen ~ antg + sala, data = data)
summary(modelo5)

Call:
lm(formula = ausen ~ antg + sala, data = data)

Residuals:
   Min     1Q Median     3Q    Max 
-5.508 -1.103  0.033  1.914  3.610 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.9557841  1.0680594  11.194 1.35e-14 ***
antg        -0.1934922  0.0354307  -5.461 1.95e-06 ***
sala        -0.0034216  0.0007495  -4.565 3.85e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.236 on 45 degrees of freedom
Multiple R-squared:  0.6662,    Adjusted R-squared:  0.6514 
F-statistic: 44.91 on 2 and 45 DF,  p-value: 1.898e-11