# Breusch-Pagan test for heteroskedasticity

The program below performs the Breusch-Pagan test step-by-step, then uses bptest from the lmtest package.

In order to use the built-in heteroskedasticity tests in R, you’ll have to install the lmtest package.

install.packages("lmtest")


## References

Problems, examples, and data from Wooldridge, Jeffrey M. Introductory Econometrics: A Modern Approach, 4th ed. South-Western Pub. 2009.

## Listing of the R program.

#
#  30 Oct 2012 D.S.Dixon
#
# Reproduces Example 8.4 in
#
#  Wooldridge, Jeffrey M. Introductory Econometrics: A Modern Approach. 4th ed. South-Western Pub. 2009.
#

# HPRICE1.DES
#
# price     assess    bdrms     lotsize   sqrft     colonial  lprice    lassess
# llotsize  lsqrft
#
#   Obs:    88
#
#   1. price                    house price, $1000s
#   2. assess                   assessed value, $1000s
#   3. bdrms                    number of bedrooms
#   4. lotsize                  size of lot in square feet
#   5. sqrft                    size of house in square feet
#   6. colonial                 =1 if home is colonial style
#   7. lprice                   log(price)
#   8. lassess                  log(assess)
#   9. llotsize                 log(lotsize)
#  10. lsqrft                   log(sqrft)

library(car)
library(lmtest)

# NOTE(review): this listing never reads its data; the original post
# presumably loaded HPRICE1 into 'mydata' before this point -- confirm.
# (A stray, unused "formula0 <- mydata$lotsize + ..." line was removed.)

# Stage 1: the model of interest
myfit0 <- lm(price ~ lotsize + sqrft + bdrms, data=mydata)
output0 <- summary(myfit0)
print(output0)

# regress the residuals squared against the same model
myfit1 <- lm(I(myfit0$residuals^2) ~ lotsize + sqrft + bdrms, data=mydata)
output1 <- summary(myfit1)
print(output1)

# H0: u^2 does not depend on any parameter (e.g. all coefficients are statistically zero)
# This would be the F-test if it's homoskedastistic (which we don't know 'till after we
#   test it) so we use the LM test where LM ~ chi-squared(df = number of parameters in the model)
Rsq <- output1$r.squared
N <- length(output0$residuals)
df <- length(myfit0$coefficients) - 1
LM <- N * Rsq

# P [X > x]
pvalue <- pchisq(LM, df, lower.tail = FALSE)

cat("LM = ", LM, "\n")
cat("df = ", df, "\n")
cat("p = ", pvalue, "\n")

# now use the canned version of Breusch-Pagan from the lmtest package
print(bptest(myfit0))


# Heteroskedasticity-robust Standard Errors

The program below calculates the heteroskedasticity-robust standard errors.

In order to use the built-in heteroskedasticity tests in R, you’ll have to install the lmtest package.

install.packages("lmtest")

Then, assuming you saved regression results to myfit0, to run the Breusch-Pagan test,

library(lmtest)
bptest(myfit0)

## References

Problems, examples, and data from Wooldridge, Jeffrey M. Introductory Econometrics: A Modern Approach, 4th ed. South-Western Pub. 2009.

## Listing of the R program.

#
#  31 Oct 2012 D.S.Dixon
#
# Reproduces Example 8.2 in
#
#  Wooldridge, Jeffrey M. Introductory Econometrics: A Modern Approach. 4th ed. South-Western Pub. 2009.
#

library(car)

mydata <- read.table("GPA3.csv", sep=",", header = TRUE, na.strings = ".")
cleandata <- na.omit(mydata[mydata$spring == 1,c("cumgpa","sat","hsperc","tothrs","female","black","white")])

# Example 8.2: regress cumulative GPA on SAT score, high-school percentile,
# total hours, and demographic dummies, then report White (HC0) robust SEs.
myfit0 <- lm(
  cumgpa ~ sat + hsperc + tothrs + female + black + white,
  data = cleandata
)
output0 <- summary(myfit0)
print(output0)

# print the heteroskedasticity-robust standard errors
robust_vcov <- hccm(myfit0, type = "hc0")
print(sqrt(diag(robust_vcov)))


# Cost Function

The first part of the program below is an example of regressing data, plotting the data points and the fit,
and showing a few interesting points on the curve.

The second part of the program is an example of computing the ATC, AVC and MC curves and plotting those.

Note that MC could be computed discretely if the first derivative is not known. You would do this by calculating
the differences and plotting them against the midpoints.

# Discrete marginal cost: first differences of TC plotted at the
# interval midpoints.  Note the parentheses around (length(.) - 1):
# the original "1:length(TC)-1" parses as (1:length(TC)) - 1, i.e. a
# sequence starting at 0 -- it only worked because R drops index 0.
mc <- TC[2:length(TC)] - TC[1:(length(TC) - 1)]
dQ <- 0.5 + Q[1:(length(Q) - 1)]
plot(dQ, mc)


## Listing of the R program.

#
#  28 Oct 2012 D.S.Dixon
#

library(car)
source("RegReportLibrary.R")

# Output levels and total cost observations
Q  <- c(1:10)
TC <- c(193,226,240,244,257,260,274,297,350,420)

# Fit a cubic total-cost curve: TC = B0 + B1*Q + B2*Q^2 + B3*Q^3
myfit0 <- lm(TC~Q+I(Q^2)+I(Q^3))
output0 <- summary(myfit0)
print(output0)

# Extract the individual coefficients.  The original assigned the whole
# coefficient vector to each of B0..B3, so every later formula was a
# 4-vector instead of a scalar.
B0 <- myfit0$coefficients[1]
B1 <- myfit0$coefficients[2]
B2 <- myfit0$coefficients[3]
B3 <- myfit0$coefficients[4]

# Inflection point of the cubic: TC'' = 2*B2 + 6*B3*Q = 0
inflex.x <- - B2 / (3 * B3)
inflex.y <- predict(myfit0, newdata=data.frame(Q=inflex.x))
# Slope (marginal cost) at the inflection point.  Note the '+' on the
# quadratic term: the original's '- 3*B3*...' contradicted the MC
# formula used later in this listing (B1 + 2*B2*Q + 3*B3*Q^2).
inflex.slope <- B1 + 2 * B2 * inflex.x + 3 * B3 * inflex.x^2

cat("inflection at Q =",inflex.x,", TC =", inflex.y,"\n")

# Bracket the inflection point where curvature is within 1% (of TC at
# the inflection) of zero.
delta <- 0.01 * inflex.y
QA <- - (B2 + delta) / (3 * B3)
QB <- - (B2 - delta) / (3 * B3)

cat("QA = ",QA,", QB = ",QB,"\n")

# Scatter plot of the raw (Q, TC) data with the fitted cubic overlaid,
# marking the inflection point and the two near-constant-MC points.
png("ScatterPlusFit.png")
plot(Q, TC)
# fitted total-cost curve
lines(Q,predict(myfit0))
# triangle above and del below the inflection point, joined by a segment
points(c(inflex.x,inflex.x),c(inflex.y - 10,inflex.y + 10), pch=c(2,6))
lines(c(inflex.x,inflex.x),c(inflex.y - 10,inflex.y + 10))
# X marks at QA and QB, where curvature is near zero (computed above)
points(c(QA,QB),predict(myfit0, data.frame(Q=c(QA,QB))), pch=4)
legend(1,410,c("data","inflection","constant MC"),pch=c(1,2,4))
dev.off()

# Predicted total cost at each output level
PQ <- predict(myfit0)
# Pull the fitted coefficients directly: (Intercept), Q, Q^2, Q^3.
# Using them here (rather than the B0..B3 globals, which the original
# listing assigned incorrectly) keeps this block's arithmetic correct.
cf <- coef(myfit0)
# Average total cost and average variable cost (fixed cost = intercept)
atc <- PQ / Q
avc <- (PQ - cf[[1]]) / Q
# Marginal cost from the analytic derivative of the fitted cubic.
# A discrete alternative would be first differences of PQ plotted at
# midpoints (see the commented lines in the original post):
# mc <- PQ[2:length(PQ)] - PQ[1:(length(PQ) - 1)]
# dQ <- .5 + PQ[1:(length(PQ) - 1)]
mc <- cf[[2]] + 2 * cf[[3]] * Q + 3 * cf[[4]] * Q^2

png("CostFunctions.png")
# Plot ATC, AVC and MC on one set of axes
plot(Q, atc, type="l", ylab="$", ylim=c(0, max(atc)), lty=1)
lines(Q, avc, lty=2)
# lines(dQ, mc, lty=3)   # discrete marginal-cost alternative
lines(Q, mc, lty=3)
legend(8, 180, c("ATC","AVC","MC"), lty=c(1,2,3))
dev.off()


# Econometric equations in WordPress

Here’s a great way to get econometrics equations into your blog posts – you create a latex array environment and put each coefficient at the left of a new column. That way, you can get the standard errors to line up in the row below. It ends up looking like this:

$\begin{array}{llllll}
\widehat{log(wage)} = & 5.65\ + & 0.47\ educ\ + & .00078\ educ * pareduc\ + & .019\ exper\ + & .010\ tenure \\
 & (.13) & (.010) & (.00021) & (.004) & (.003) \\
\multicolumn{3}{l}{n = 722, R^{2} = .169}
\end{array}$

# More on creating interaction dummies

Here’s the dummies created all in one line (plus a line to add the column labels)

dummies <- 1*outer(apply(data.matrix(cleandata[,c("female","athlete")]) * c(2,1),1,sum),c(3:0),"==")
colnames(dummies)<-c("female_athlete","female_nonathlete","male_athlete","male_nonathlete")

# Interaction dummies

Here’s a couple of quick and clean ways to add new interaction dummy variables to your data matrix so you can regress them the same way as other columns in the matrix

# make a copy of your data, in case you make a mistake
newdata <- cleandata

# create new columns with the new dummy variables
newdata[,"female_athlete"] <- 1*(cleandata$female == 1 & cleandata$athlete == 1)
newdata[,"female_nonathlete"] <- 1*(cleandata$female == 1 & cleandata$athlete == 0)
newdata[,"male_athlete"] <- 1*(cleandata$female == 0 & cleandata$athlete == 1)
newdata[,"male_nonathlete"] <- 1*(cleandata$female == 0 & cleandata$athlete == 0)

# outer is a little more exotic: it gets the outer equals ("==") product of female and the vector (1,1,0)
# and ANDs (&) it with the outer equals product of athlete and the vector (1,0,1)
mat <- 1*(outer(cleandata$female, c(1,1,0), "==") & outer(cleandata$athlete, c(1,0,1), "=="))
colnames(mat) <- c("female_athlete", "female_nonathlete", "male_athlete")
newdata <- cbind(cleandata, mat)

# this does the same thing, but for all four interaction dummies
mat <- 1*(outer(cleandata$female, c(1,1,0,0), "==") & outer(cleandata$athlete, c(1,0,1,0), "=="))
colnames(mat) <- c("female_athlete", "female_nonathlete", "male_athlete", "male_nonathlete")
newdata <- cbind(cleandata, mat)

# Homework 9

# C5.2

Use the data in GPA2.RAW for this exercise.

(i) Using all 4,137 observations, estimate the equation

$colgpa = \beta_{0} + \beta_{1} hsperc + \beta_{2} sat + u$

and report the results in the standard form. The regression results with the full data set are

$\begin{array}{llll}
\widehat{colgpa} = & 1.392 & -0.01352\ hsperc & +0.001476\ sat\\
 & (0.0715) & (0.000549) & (6.531e-05) \\
\multicolumn{4}{l}{n = 4137, R^{2} = 0.273, Adjusted R^{2} = 0.273}\\
\end{array}$

(ii) Reestimate the equation in part (i), using the first 2,070 observations. The regression results with the first 2070 points is

$\begin{array}{llll}
\widehat{colgpa} = & 1.436 & -0.01275\ hsperc & +0.001468\ sat\\
 & (0.0978) & (0.000719) & (8.858e-05) \\
\multicolumn{4}{l}{n = 2070, R^{2} = 0.283, Adjusted R^{2} = 0.282}\\
\end{array}$

(iii) Find the ratio of the standard errors on hsperc from parts (i) and (ii). Compare this with the results from (5.10).

The ratio of the standard errors is

$0.000549/0.000719 = 0.764$

Equation (5.10) is

$Se(\widehat{\beta_{j}}) \approx c_{j}/\sqrt{n}$

where $c_{j}$ is a positive constant that should be approximately the same for any n. That is,

$Se(\widehat{\beta_{0}}) \sqrt{n_{0}} \approx Se(\widehat{\beta_{1}}) \sqrt{n_{1}}$

so that

$Se(\widehat{\beta_{0}}) / Se(\widehat{\beta_{1}}) \approx \sqrt{n_{1}} / \sqrt{n_{0}}$

For parts (i) and (ii)

$\sqrt{2070} / \sqrt{4137} = 0.707$

This is within about 7.5% of the ratio of the standard errors.

For the full dataset, the constant $c_{j}$ is 0.0353. To examine convergence, compare this value for $c_{j}$ with values from subsets of different sizes.
That is, regress this model on two sets of 2068 (half the dataset), then on three sets of 1379, four sets of 1034, five sets of 827, and ten sets of 413. From each regression, multiply the standard error for $\beta_{1}$ by the square root of the number of samples and plot these against the number of samples. The plot is shown here, and the sample R code is shown below.

## References

Problem and data from Wooldridge Introductory Econometrics: A Modern Approach, 4e.

## Listing of the R program.

#
#  10 Oct 2012 D.S.Dixon
#
# GPA2.DES
#
# sat       tothrs    colgpa    athlete   verbmath  hsize     hsrank    hsperc
# female    white     black     hsizesq
#
#   Obs:  4137
#
#   1. sat        combined SAT score
#   2. tothrs     total hours through fall semest
#   3. colgpa     GPA after fall semester
#   4. athlete    =1 if athlete
#   5. verbmath   verbal/math SAT score
#   6. hsize      size graduating class, 100s
#   7. hsrank     rank in graduating class
#   8. hsperc     high school percentile, from top
#   9. female     =1 if female
#  10. white      =1 if white
#  11. black      =1 if black
#  12. hsizesq    hsize^2
#

source("RegReportLibrary.R")

mydata <- read.table("gpa2.csv", sep=",", header = TRUE, na.strings = ".")

# full-sample regression
myfit0 <- lm(colgpa~hsperc + sat, data=mydata)
output0 <- summary(myfit0)
print(output0)
wordpressFormat(myfit0)

nfull <- length(output0$residuals)
sefull <- output0$coefficients[2,2]

# first 2,070 observations only
myfit1 <- lm(colgpa~hsperc + sat, data=mydata[1:2070,])
output1 <- summary(myfit1)
print(output1)
wordpressFormat(myfit1)

separt <- output1$coefficients[2,2]
npart <- length(output1$residuals)

cat("ratio of standard errors: ",(sefull/separt),"\n")
cat("ratio of square root observations ",((npart/nfull)^0.5),"\n")

N <- length(mydata$colgpa)

# Subset sizes: tenths, fifths, quarters, thirds and halves of the data
N1 <- as.integer(N/10)
N2 <- as.integer(N/5)
N3 <- as.integer(N/4)
N4 <- as.integer(N/3)
N5 <- as.integer(N/2)

# 10 + 5 + 4 + 3 + 2 + 1 = 25 (size, c_j) pairs
mat <- matrix(nrow=25, ncol=2)
row <- 1

# Ten disjoint subsets of N1 rows each.  The original used
# start <- N1 * i, which begins at row 0 (silently dropped by R) and
# ends one row early; 1-based indexing gives the intended chunks.
sqrtn <- sqrt(N1)
for(i in 0:9){
  start <- N1 * i + 1
  end <- N1 * (i + 1)
  output1 <- summary(lm(colgpa~hsperc + sat, data=mydata[start:end,]))
  mat[row,] <- c(N1, output1$coefficients[2,2]*sqrtn)
  row <- row + 1
}

# Five disjoint subsets of N2 rows each
sqrtn <- sqrt(N2)
for(i in 0:4){
  start <- N2 * i + 1
  end <- N2 * (i + 1)
  output1 <- summary(lm(colgpa~hsperc + sat, data=mydata[start:end,]))
  mat[row,] <- c(N2, output1$coefficients[2,2]*sqrtn)
  row <- row + 1
}

# Four disjoint subsets of N3 rows each (1-based indexing; the original
# 0-based start/end dropped a row from the first chunk)
sqrtn <- sqrt(N3)
for(i in 0:3){
  start <- N3 * i + 1
  end <- N3 * (i + 1)
  output1 <- summary(lm(colgpa~hsperc + sat, data=mydata[start:end,]))
  mat[row,] <- c(N3, output1$coefficients[2,2]*sqrtn)
  row <- row + 1
}

# Three disjoint subsets of N4 rows each
sqrtn <- sqrt(N4)
for(i in 0:2){
  start <- N4 * i + 1
  end <- N4 * (i + 1)
  output1 <- summary(lm(colgpa~hsperc + sat, data=mydata[start:end,]))
  mat[row,] <- c(N4, output1$coefficients[2,2]*sqrtn)
  row <- row + 1
}

# Two disjoint subsets of N5 rows each (1-based indexing as above)
sqrtn <- sqrt(N5)
for(i in 0:1){
  start <- N5 * i + 1
  end <- N5 * (i + 1)
  output1 <- summary(lm(colgpa~hsperc + sat, data=mydata[start:end,]))
  mat[row,] <- c(N5, output1$coefficients[2,2]*sqrtn)
  row <- row + 1
}

# finally, the full dataset
sqrtn <- sqrt(N)
mat[row,] <- c(N, output0$coefficients[2,2]*sqrtn)

## make a plot of constants
# NOTE(review): no png()/pdf() device is opened before this dev.off();
# a png("...") call was probably lost in transcription -- confirm.
plot(mat, xlab="number of points per dataset", ylab="stderr")
dev.off()

# c_j for the full dataset
cat("c = ", output0$coefficients[2,2]*sqrtn, "\n")


# Models with Interaction Terms

## Interaction Terms

Suppose you have an econometric model of the form

price = β0 + β1 sqrft + β2 bdrms + u

where

price = house price in 1000s of dollars
sqrft = size of house in square feet
bdrms = number of bedrooms

The estimator β2 is marginal home price per bedroom. Or is it? Ceteris paribus, a very small house with two small bedrooms may be more attractive than the same size house with three tiny bedrooms. In that case, the marginal price would be negative. The regression results are

Residuals:
     Min       1Q   Median       3Q      Max
-127.627  -42.876   -7.051   32.589  229.003

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -19.31500   31.04662  -0.622    0.536
sqrft         0.12844    0.01382   9.291 1.39e-14 ***
bdrms        15.19819    9.48352   1.603    0.113
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 63.04 on 85 degrees of freedom
Multiple R-squared: 0.6319, Adjusted R-squared: 0.6233
F-statistic: 72.96 on 2 and 85 DF, p-value: < 2.2e-16

To take into account that increasing numbers of bedrooms may only be a good thing for a larger house, introduce a term that is the product of house size and number of bedrooms. Now the econometric model is

price = β0 + β1 sqrft + β2 bdrms + β3 sqrft * bdrms + u

The regression of this model is

Residuals:
     Min       1Q   Median       3Q      Max
-146.858  -33.911   -8.586   27.900  212.400

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  181.69081   92.18885   1.971   0.0520 .
sqrft          0.03262    0.04364   0.747   0.4569
bdrms        -35.95534   24.01237  -1.497   0.1380
sqrft*bdrms    0.02345    0.01016   2.308   0.0234 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 61.5 on 84 degrees of freedom
Multiple R-squared: 0.6539, Adjusted R-squared: 0.6415
F-statistic: 52.9 on 3 and 84 DF, p-value: < 2.2e-16

Now the marginal price per bedroom is

marginal price per bedroom = -35.95534 + 0.02344804 * sqrft
                             (24.01237)  (0.01016)

For the values of sqrft in this dataset, the marginal price per bedroom ranges from -$8,500 to $55,020, with a mean of $11,260.

Here’s a comparison of the two regressions

Model 1 Model 2
F-statistic 72.964 52.896
(Intercept) -19.315 181.69
….stderr ( 31.047 ) ( 92.189 ) 
sqrft 0.12844 0.032621
….stderr. ( 0.013824 )  ( 0.043642 )
bdrms 15.198 -35.955
….stderr . ( 9.4835 ) ( 24.012 )
sqrft*bdrms 0.023448
….stderr.. ( 0.010157 ) 

*   10% significance
**  5% significance
*** 1% significance

## Listing of the R programs.

### Program to print the regression results.

#
#  10 Oct 2012 D.S.Dixon
#

# HPRICE1.DES
#
# price     assess    bdrms     lotsize   sqrft     colonial  lprice    lassess
# llotsize  lsqrft
#
#   Obs:    88
#
#   1. price                    house price, $1000s # 2. assess assessed value,$1000s
#   3. bdrms                    number of bedrooms
#   4. lotsize                  size of lot in square feet
#   5. sqrft                    size of house in square feet
#   6. colonial                 =1 if home is colonial style
#   7. lprice                   log(price)
#   8. lassess                  log(assess
#   9. llotsize                 log(lotsize)
#  10. lsqrft                   log(sqrft)
#

print(summary(mydata))

# base model: price on size and bedroom count
myfit0 <- lm(price~sqrft + bdrms, data=mydata)
output0 <- summary(myfit0)
print(output0)

# add the interaction term sqrft * bdrms
sqftrooms <- mydata$sqrft * mydata$bdrms
myfit1 <- lm(price~sqrft + bdrms + sqftrooms, data=mydata)
# Rename only the interaction coefficient.  The original assigned a
# single name to the whole coefficient vector, which set the other
# three names to NA and broke the ["bdrms", ...] lookups below.
names(myfit1$coefficients)[4] <- "sqrft*bdrms"
output1 <- summary(myfit1)
print(output1)

# marginal price per bedroom depends on house size through the interaction
cat("marginal price per bedroom = ", output1$coefficients["bdrms",1],
    " + ", output1$coefficients["sqrft*bdrms",1], " * sqrft\n")

minmargin <- output1$coefficients["bdrms",1]  + output1$coefficients["sqrft*bdrms",1] * min(mydata$sqrft)
maxmargin <- output1$coefficients["bdrms",1]  + output1$coefficients["sqrft*bdrms",1] * max(mydata$sqrft)
meanmargin <- output1$coefficients["bdrms",1] + output1$coefficients["sqrft*bdrms",1] * mean(mydata$sqrft)
cat("range = ", minmargin, " to ", maxmargin, ", with a mean of ", meanmargin,"\n")


### Program to print the comparison table.

#
#  10 Oct 2012 D.S.Dixon
#
# This outputs a table in html form
#

# HPRICE1.DES
#
# price     assess    bdrms     lotsize   sqrft     colonial  lprice    lassess
# llotsize  lsqrft
#
#   Obs:    88
#
#   1. price                    house price, $1000s # 2. assess assessed value,$1000s
#   3. bdrms                    number of bedrooms
#   4. lotsize                  size of lot in square feet
#   5. sqrft                    size of house in square feet
#   6. colonial                 =1 if home is colonial style
#   7. lprice                   log(price)
#   8. lassess                  log(assess
#   9. llotsize                 log(lotsize)
#  10. lsqrft                   log(sqrft)
#

#
# In order to use this, you have to install the xtable package. You do that by starting R and typing
#
#     install.packages("xtable")
#
# or use the package tool in your GUI if you have one
#

library("xtable")

# base model
myfit0 <- lm(price~sqrft + bdrms, data=mydata)
output0 <- summary(myfit0)

# model with the sqrft * bdrms interaction
sqftrooms <- mydata$sqrft * mydata$bdrms
myfit1 <- lm(price~sqrft + bdrms + sqftrooms, data=mydata)
# rename only the interaction coefficient (see the regression listing above)
names(myfit1$coefficients)[4] <- "sqrft*bdrms"
output1 <- summary(myfit1)

# one formatting convention for every cell of the table
fmt <- function(x) format(x, digits=5, width=0, justify="right")

# 10 rows: Adj R-sq, F, then estimate/stderr pairs for 4 coefficients
mat <- matrix(nrow=10, ncol=2)

mat[1,1] <- fmt(output0$adj.r.squared)
# fstatistic is a length-3 vector (value, numdf, dendf); take the value
mat[2,1] <- fmt(output0$fstatistic[1])
mat[3,1] <- fmt(output0$coefficients[1,1])
mat[5,1] <- fmt(output0$coefficients[2,1])
mat[7,1] <- fmt(output0$coefficients[3,1])

# significance stars per coefficient (the scrape had stripped the stars)
sigstr <- c("","*","**","***")[1 + (output0$coefficients[,4] <= 0.1) +
  (output0$coefficients[,4] <= 0.05) + (output0$coefficients[,4] <= 0.01)]
mat[4,1] <- paste("(", fmt(output0$coefficients[1,2]), ") ", sigstr[1])
mat[6,1] <- paste("(", fmt(output0$coefficients[2,2]), ") ", sigstr[2])
mat[8,1] <- paste("(", fmt(output0$coefficients[3,2]), ") ", sigstr[3])

mat[1,2] <- fmt(output1$adj.r.squared)
mat[2,2] <- fmt(output1$fstatistic[1])
mat[3,2] <- fmt(output1$coefficients[1,1])
mat[5,2] <- fmt(output1$coefficients[2,1])
mat[7,2] <- fmt(output1$coefficients[3,1])
mat[9,2] <- fmt(output1$coefficients[4,1])

sigstr <- c("","*","**","***")[1 + (output1$coefficients[,4] <= 0.1) +
  (output1$coefficients[,4] <= 0.05) + (output1$coefficients[,4] <= 0.01)]
mat[4 ,2] <- paste("(", fmt(output1$coefficients[1,2]), ") ", sigstr[1])
mat[6 ,2] <- paste("(", fmt(output1$coefficients[2,2]), ") ", sigstr[2])
mat[8 ,2] <- paste("(", fmt(output1$coefficients[3,2]), ") ", sigstr[3])
mat[10,2] <- paste("(", fmt(output1$coefficients[4,2]), ") ", sigstr[4])

colnames(mat) <- c("Model 1","Model 2")
# interleave each coefficient name with a stderr row label
# (the original repeated the full rownames vector instead of indexing)
rownames(mat) <- c("Adj R-squared", "F-statistic",
  rownames(output1$coefficients)[1], "....stderr",
  rownames(output1$coefficients)[2], "....stderr.",
  rownames(output1$coefficients)[3], "....stderr .",
  rownames(output1$coefficients)[4], "....stderr..")

newobject <- xtable(mat)
print.xtable(newobject, type="html")
# print(mat)

# Homework 8

# C4.2

### (i) Using the same model as Problem 3.4, state and test the null hypothesis that the rank of law schools has no ceteris paribus effect on median starting salary.

The median starting salary for new law school graduates is determined by

log(salary) = β0 + β1 LSAT + β2 GPA + β3 log(libvol) + β4 log(cost) + β5 rank + u

where LSAT is the median LSAT score for the graduating class, GPA is the median college GPA for the class, libvol is the number of volumes in the law school library, cost is the annual cost of attending law school, and rank is a law school ranking (with rank = 1 being the best).

The null hypothesis that rank has no effect on log(salary) is

H0: β5 = 0

The alternative is

H1: β5 ≠ 0

which is a two-tailed test. The regression results are:

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  8.3432330  0.5325192  15.667  < 2e-16 ***
LSAT         0.0046964  0.0040105   1.171  0.24373
GPA          0.2475247  0.0900370   2.749  0.00683 **
llibvol      0.0949926  0.0332544   2.857  0.00499 **
lcost        0.0375544  0.0321061   1.170  0.24426
rank        -0.0033246  0.0003485  -9.541  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.1124 on 130 degrees of freedom
(20 observations deleted due to missingness)
Multiple R-squared: 0.8417, Adjusted R-squared: 0.8356
F-statistic: 138.2 on 5 and 130 DF, p-value: < 2.2e-16

The estimated model is

log(salary) = 8.34 + 0.00470 LSAT + 0.248 GPA + 0.0950 log(libvol) +
              (0.53)  (0.0040)      (0.090)     (0.033)
              0.0376 log(cost) – 0.00332 rank
              (0.032)           (0.00035)

n = 136, Adj. R2 = 0.8356

Note from the regression results that the t-value is -9.541, which is highly significant. The critical value at one percent significance for 120 degrees of freedom is 2.617, and clearly |t| >> 2.617. Thus we reject the null hypothesis.
### (ii) Are features of the incoming class of students — namely, LSAT and GPA — individually or jointly significant for explaining salary? (Be sure to account for missing data on LSAT and GPA.) Based on the t-value in the regression results, the coefficient on LSAT is not significant but GPA is significant at the 1% level, which would dominate any joint significance. unrestricted SSR = 1.6427, df = 130 restricted SSR = 1.8942, df = 132 F(2,130) = 9.95, 1% critical value for F(2,inf) = 4.61  Thus, at the 1% level, we reject the null hypothesis that LCAT and GPA are jointly insignificant. From the R linearHypothesis test,  Linear hypothesis test Hypothesis: LSAT = 0 GPA = 0 Model 1: restricted model Model 2: lsalary ~ LSAT + GPA + llibvol + lcost + rank Res.Df RSS Df Sum of Sq F Pr(>F) 1 132 1.8942 2 130 1.6427 2 0.25151 9.9517 9.518e-05 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1  Here, at the 0.1% level, we reject the null hypothesis that LCAT and GPA are jointly insignificant. ### (iii) Test whether incoming class size (clsize) or the size of the faculty (faculty) needs to be added to this equation; carry out a single test. (Be careful to account for missing data on clsize and faculty.) To test the joint significance of these variables, first create a model that includes them. Then, do the test manually with an unrestricted regression of this model, then a restricted model that omits these two variables. For the unrestricted model unrestricted SSR = 1.5732, df = 123 restricted SSR = 1.5974, df = 125 F(2,123) = 0.9484, 10% critical value for F(2,120) = 2.35  Thus, at the 10% level, we fail to reject the null hypothesis that clsize and faculty are jointly insignificant. 
From the R linearHypothesis test,  Linear hypothesis test Hypothesis: clsize = 0 faculty = 0 Model 1: restricted model Model 2: lsalary ~ LSAT + GPA + llibvol + lcost + rank + clsize + faculty Res.Df RSS Df Sum of Sq F Pr(>F) 1 125 1.5974 2 123 1.5732 2 0.024259 0.9484 0.3902  Here, at the 39% level, we fail to reject the null hypothesis that clsize and faculty are jointly insignificant. ### (iv) What factors might influence the rank of the law school that are not included in the salary regression? There are salary differences based on gender and race/ethnicity across the labor market, and these may have some correlation with rank for a few law schools, but probably not over the entire data set. Individual programs are frequently ranked by the frequency and quality of publications by their faculty, so that is very likely to correlate with rank. ## References Problem and data from Wooldridge Introductory Econometrics: A Modern Approach, 4e. ## Listing of the R program. # # 1 Oct 2012 D.S.Dixon # # # LAWSCH85.DES # # rank salary cost LSAT GPA libvol faculty age # clsize north south east west lsalary studfac top10 # r11_25 r26_40 r41_60 llibvol lcost # # Obs: 156 # # 1. rank law school ranking # 2. salary median starting salary # 3. cost law school cost # 4. LSAT median LSAT score # 5. GPA median college GPA # 6. libvol no. volumes in lib., 1000s # 7. faculty no. of faculty # 8. age age of law sch., years # 9. clsize size of entering class # 10. north =1 if law sch in north # 11. south =1 if law sch in south # 12. east =1 if law sch in east # 13. west =1 if law sch in west # 14. lsalary log(salary) # 15. studfac student-faculty ratio # 16. top10 =1 if ranked in top 10 # 17. r11_25 =1 if ranked 11-25 # 18. r26_40 =1 if ranked 26-40 # 19. r41_60 =1 if ranked 41-60 # 20. llibvol log(libvol) # 21. 
# 21. lcost      log(cost)
#

mydata <- read.table("LAWSCH85.csv", sep=",", header = TRUE, na.strings = ".")
print(summary(mydata))

# eliminate missing data in LSAT and GPA
cleandata <- mydata[!is.na(mydata$LSAT) & !is.na(mydata$GPA),]

# unrestricted model
myfit0 <- lm(lsalary~LSAT+GPA+llibvol+lcost+rank, data=cleandata)
fitsum0 <- summary(myfit0)
SSR0 <- deviance(myfit0)
df0 <- df.residual(myfit0)
print(fitsum0)
print(paste("n = ",length(myfit0$residuals)))

# restricted model (LSAT=0, GPA=0)
myfit0Rest <- lm(lsalary~llibvol+lcost+rank, data=cleandata)
fitsum0Rest <- summary(myfit0Rest)

SSR0Rest <- deviance(myfit0Rest)
df0Rest <- df.residual(myfit0Rest)

print(fitsum0Rest)
print(paste("n = ",length(myfit0Rest$residuals)))

print(paste("SSR0 = ",SSR0))
print(paste("SSR0Rest = ",SSR0Rest))

# manual F statistic: q restrictions, n-k-1 residual df
q <- df0Rest - df0
nk1 <- df0
print(paste("df0 = ",df0))
print(paste("df0Rest = ",df0Rest))
print(paste("q = ",q))
print(paste("nk1 = ",nk1))
F0 <- ((SSR0Rest - SSR0)/q)/(SSR0/nk1)
print("old school")
print(paste("F = ",F0))

library(car)

# joint significance of LSAT and GPA.
# Fit on cleandata so linearHypothesis uses the same sample as the
# manual F test above (the original passed data=mydata here; lm() drops
# the NA rows itself, so the fitted sample is the same either way).
hypmatrix <- rbind(c(0,1,0,0,0,0),c(0,0,1,0,0,0))
rhs <- c(0,0)
myfit <- lm(lsalary~LSAT+GPA+llibvol+lcost+rank, data=cleandata)
hyp <- linearHypothesis(myfit, hypmatrix, rhs)
print("new school")
print(hyp)

# now eliminate missing data from clsize and faculty
cleanerdata <- cleandata[!is.na(cleandata$clsize) & !is.na(cleandata$faculty),]
print(summary(cleanerdata))

# unrestricted model
myfit1 <- lm(lsalary~LSAT+GPA+llibvol+lcost+rank+clsize+faculty, data=cleanerdata)
fitsum1 <- summary(myfit1)
SSR1 <- deviance(myfit1)
df1 <- df.residual(myfit1)

# restricted model (clsize=0, faculty=0)
myfit1Rest <- lm(lsalary~LSAT+GPA+llibvol+lcost+rank, data=cleanerdata)
fitsum1Rest <- summary(myfit1Rest)
SSR1Rest <- deviance(myfit1Rest)
df1Rest <- df.residual(myfit1Rest)

print("Testing clsize and faculty")
print(paste("n = ",length(myfit1Rest$residuals)))

# Report the unrestricted and restricted sums of squared residuals
print(paste("SSR1 = ",SSR1))
print(paste("SSR1Rest = ",SSR1Rest))
# q = number of restrictions, nk1 = residual df of the unrestricted model
q <- df1Rest - df1
nk1 <- df1
print(paste("df1 = ",df1))
print(paste("df1Rest = ",df1Rest))
print(paste("q = ",q))
print(paste("nk1 = ",nk1))
# manual F statistic for H0: clsize = faculty = 0
F1 <- ((SSR1Rest - SSR1)/q)/(SSR1/nk1)
print("old school")
print(paste("F = ",F1))

library(car)

# joint significance of clsize and faculty (the last two coefficients).
# The original comment said "LSAT and GPA", but the hypothesis matrix
# below selects columns 7 and 8, i.e. clsize and faculty.
hypmatrix <- rbind(c(0,0,0,0,0,0,1,0),c(0,0,0,0,0,0,0,1))
rhs <- c(0,0)

myfit <- lm(lsalary~LSAT+GPA+llibvol+lcost+rank+clsize+faculty, data=mydata)
hyp <- linearHypothesis(myfit, hypmatrix, rhs)

print("new school")
print(hyp)


# An Example of Hypothesis Testing Using R

## Installing the car library

If you haven’t done it already, you’ll have to install the car library, which includes, among many other useful things, the linearHypothesis command.

Start R and type

install.packages('car',.Library)


It asks you to select a mirror (server) from which to download. Doesn’t much matter, I suppose, but I picked Tennessee.

Once you do this, you never have to do it again, on the same computer, anyway.

## The example econometric model

The model is from the textbook, from section 4.4, starting on page 140.

log(wage) = β0 + β1 jc + β2 univ + β3 exper + u

where

jc = number of years attending a two-year college.
univ = number of years at a four-year college.
exper = months in the workforce.


The null hypothesis is that, in terms of impact on wages, a year at a two-year college is equivalent to a year at a four-year college. That is

H0: β1 = β2

The alternative is that a year at a two-year college earns a lower wage than a year at a four-year college:

H1: β1 < β2

## Setting up the hypothesis test

### Getting to know the data

Here is a summary of the data

> print(summary(mydata))
female          phsrank            BA               AA
Min.   :0.0000   Min.   : 0.00   Min.   :0.0000   Min.   :0.00000
1st Qu.:0.0000   1st Qu.:44.00   1st Qu.:0.0000   1st Qu.:0.00000
Median :1.0000   Median :50.00   Median :0.0000   Median :0.00000
Mean   :0.5196   Mean   :56.16   Mean   :0.3065   Mean   :0.04406
3rd Qu.:1.0000   3rd Qu.:76.00   3rd Qu.:1.0000   3rd Qu.:0.00000
Max.   :1.0000   Max.   :99.00   Max.   :1.0000   Max.   :1.00000
black            hispanic             id            exper
Min.   :0.00000   Min.   :0.00000   Min.   :   19   Min.   :  3.0
1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:19372   1st Qu.:104.0
Median :0.00000   Median :0.00000   Median :39301   Median :129.0
Mean   :0.09508   Mean   :0.04687   Mean   :40616   Mean   :122.4
3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:58842   3rd Qu.:149.0
Max.   :1.00000   Max.   :1.00000   Max.   :89958   Max.   :166.0
jc              univ           lwage            stotal
Min.   :0.0000   Min.   :0.000   Min.   :0.5555   Min.   :-3.32480
1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.9253   1st Qu.:-0.32734
Median :0.0000   Median :0.200   Median :2.2763   Median : 0.00000
Mean   :0.3389   Mean   :1.926   Mean   :2.2481   Mean   : 0.04748
3rd Qu.:0.0000   3rd Qu.:4.200   3rd Qu.:2.5969   3rd Qu.: 0.61079
Max.   :3.8333   Max.   :7.500   Max.   :3.9120   Max.   : 2.23537
smcity          medcity           submed            lgcity
Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000
1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000
Median :0.0000   Median :0.0000   Median :0.00000   Median :0.00000
Mean   :0.2854   Mean   :0.1174   Mean   :0.06861   Mean   :0.09448
3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000
Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000
sublg            vlgcity            subvlg              ne
Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000
1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000
Median :0.00000   Median :0.00000   Median :0.00000   Median :0.0000
Mean   :0.08709   Mean   :0.05855   Mean   :0.06358   Mean   :0.2107
3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000
Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000
nc             south           totcoll
Min.   :0.0000   Min.   :0.0000   Min.   : 0.000
1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 0.000
Median :0.0000   Median :0.0000   Median : 1.507
Mean   :0.2988   Mean   :0.3271   Mean   : 2.265
3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 4.367
Max.   :1.0000   Max.   :1.0000   Max.   :10.067


The diligent student will also do histograms of the variables of interest.

### Getting to know the model

Here is a the regression model

> myfit <- lm(lwage~jc+univ+exper,data=mydata)


Here is a summary of the results from the regression

> print(summary(myfit))

Call:
lm(formula = lwage ~ jc + univ + exper, data = mydata)

Residuals:
Min       1Q   Median       3Q      Max
-2.10362 -0.28132  0.00551  0.28518  1.78167

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.4723256  0.0210602  69.910   <2e-16 ***
jc          0.0666967  0.0068288   9.767   <2e-16 ***
univ        0.0768763  0.0023087  33.298   <2e-16 ***
exper       0.0049442  0.0001575  31.397   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4301 on 6759 degrees of freedom
Multiple R-squared: 0.2224,	Adjusted R-squared: 0.2221
F-statistic: 644.5 on 3 and 6759 DF,  p-value: < 2.2e-16


### Creating a new variable

The book gives an example of using a t-test on a created variable

θ = β1 – β2

so that the hypotheses are

H0: θ = 0
H1: θ < 0

This is an instructive exercise, and I encourage students to try it. I may include it in this blog post sometime, but for now, I’ll skip over it.

### Setting up linearHypothesis

The general matrix form for a hypothesis test is

M β = Y

where

M = n x k matrix, n = number of hypotheses, k = number of estimators
β = k-length vector of estimators
Y = n-length vector of constraint values

For the null hypothesis

H0: β1 – β2 = 0

the hypothesis matrices are

M = [0, 1, -1, 0]
β = [β0, β1, β2, β3]
Y = [0]

> hypomatrix <- c(0,1,-1,0)
> hypotest <- linearHypothesis(myfit,hypomatrix,0)
> print(hypotest)
Linear hypothesis test

Hypothesis:
jc - univ = 0

Model 1: restricted model
Model 2: lwage ~ jc + univ + exper

Res.Df    RSS Df Sum of Sq     F Pr(>F)
1   6760 1250.9
2   6759 1250.5  1   0.39853 2.154 0.1422


With F(1,6759) = 2.154, we fail to reject the null hypothesis at the 15% level. That is, a year at a two-year college is equivalent to a year at a four-year college as far as wage is concerned.

### Listing of the R program used to answer these questions.

#
#  28 Aug 2012 D.S.Dixon
#
# This is the dataset for the first EC 460 homework assignment
#

# from TWOYEAR.DES
#
# female    phsrank   BA        AA        black     hispanic  id        exper
# jc        univ      lwage     stotal    smcity    medcity   submed    lgcity
# sublg     vlgcity   subvlg    ne        nc        south     totcoll
#
#   Obs:      6763
#
#  1. female                            =1 if female
#  2. phsrank                           % high school rank; 100 = best
#  3. BA                                =1 if Bachelor's degree
#  4. AA                                =1 if Associate's degree
#  5. black                             =1 if African-American
#  6. hispanic                          =1 if Hispanic
#  7. id                                ID Number
#  8. exper                             total (actual) work experience
#  9. jc                                total 2-year credits
# 10. univ                              total 4-year credits
# 11. lwage                             log hourly wage
# 12. stotal                            total standardized test score
# 13. smcity                            =1 if small city, 1972
# 14. medcity                           =1 if med. city, 1972
# 15. submed                            =1 if suburb med. city, 1972
# 16. lgcity                            =1 if large city, 1972
# 17. sublg                             =1 if suburb large city, 1972
# 18. vlgcity                           =1 if very large city, 1972
# 19. subvlg                            =1 if sub. very lge. city, 1972
# 20. ne                                =1 if northeast
# 21. nc                                =1 if north central
# 22. south                             =1 if south
# 23. totcoll                           jc + univ
#

# fetch the library with linearHypothesis (from the car package)
library(car)

# summarize the data
### @export "data-summary"
print(summary(mydata))
### @end

# run the regression model: log wage on two-year credits, four-year
# credits, and work experience
### @export "regression-model"
myfit <- lm(lwage~jc+univ+exper,data=mydata)
### @end

# summarize the regression
### @export "regression-summary"
print(summary(myfit))
### @end

# test H0: beta_jc - beta_univ = 0 via the contrast vector (0, 1, -1, 0)
### @export "hypothesis-test"
hypomatrix <- c(0,1,-1,0)
hypotest <- linearHypothesis(myfit,hypomatrix,0)
print(hypotest)
### @end