First let’s work on the Longley data in the original scale.
# clean up workspace
rm(list = ls())
# read the Longley data
all <-
read.table("http://hua-zhou.github.io/teaching/st758-2014fall/longley.dat",
col.names = c("employed", "GNPdef", "GNP", "unemployed", "armed",
"pop", "year"))
X <- cbind(1, as.matrix(all[, 2:7]))
colnames(X)[1] <- "intercept"
X
## intercept GNPdef GNP unemployed armed pop year
## [1,] 1 83.0 234289 2356 1590 107608 1947
## [2,] 1 88.5 259426 2325 1456 108632 1948
## [3,] 1 88.2 258054 3682 1616 109773 1949
## [4,] 1 89.5 284599 3351 1650 110929 1950
## [5,] 1 96.2 328975 2099 3099 112075 1951
## [6,] 1 98.1 346999 1932 3594 113270 1952
## [7,] 1 99.0 365385 1870 3547 115094 1953
## [8,] 1 100.0 363112 3578 3350 116219 1954
## [9,] 1 101.2 397469 2904 3048 117388 1955
## [10,] 1 104.6 419180 2822 2857 118734 1956
## [11,] 1 108.4 442769 2936 2798 120445 1957
## [12,] 1 110.8 444546 4681 2637 121950 1958
## [13,] 1 112.6 482704 3813 2552 123366 1959
## [14,] 1 114.2 502601 3931 2514 125368 1960
## [15,] 1 115.7 518173 4806 2572 127852 1961
## [16,] 1 116.9 554894 4007 2827 130081 1962
# we see the smallest singular value (aka trouble number) is very small
X.svd <- svd(X)
X.svd$d
## [1] 1.664e+06 8.390e+04 3.407e+03 1.583e+03 4.169e+01 3.648e+00 3.424e-04
# condition number of the design matrix
X.cond <- max(X.svd$d) / min(X.svd$d)
X.cond
## [1] 4.859e+09
# R function for obtaining condition number; exact = FALSE (default)
# is an approximation
kappa(X)
## [1] 5.618e+09
kappa(X, exact = TRUE)
## [1] 4.859e+09
# fit linear regression
beta.y <- lm(all[, 1] ~ X - 1)$coefficients
beta.y
## Xintercept XGNPdef XGNP Xunemployed Xarmed Xpop
## -3.482e+06 1.506e+01 -3.582e-02 -2.020e+00 -1.033e+00 -5.110e-02
## Xyear
## 1.829e+03
# fit linear regression with perturbed y
set.seed(2014)
ye <- 1.0e3 * rnorm(nrow(all))
beta.ye <- lm(all[, 1] + ye ~ X - 1)$coefficients
beta.ye
## Xintercept XGNPdef XGNP Xunemployed Xarmed Xpop
## -9.191e+06 -1.309e+02 -1.434e-01 -4.116e+00 -1.933e+00 -7.801e-02
## Xyear
## 4.785e+03
# relative error in y
sqrt(crossprod(ye) / crossprod(all[, 1]))
## [,1]
## [1,] 0.01314
# relative error in regression coefficients
sqrt(crossprod(beta.y - beta.ye) / crossprod(beta.y))
## [,1]
## [1,] 1.639
What happens if we center and scale predictors?
# let's center and scale the predictors except intercept
Xs <- cbind(1, scale(X[, -1], center = TRUE, scale = TRUE))
colnames(Xs)[1] <- "intercept"
Xs
## intercept GNPdef GNP unemployed armed pop year
## [1,] 1 -1.7311 -1.5434 -0.8960 -1.46093 -1.411135 -1.5753
## [2,] 1 -1.2214 -1.2905 -0.9292 -1.65348 -1.263926 -1.3653
## [3,] 1 -1.2492 -1.3043 0.5230 -1.42357 -1.099898 -1.1552
## [4,] 1 -1.1288 -1.0373 0.1687 -1.37471 -0.933713 -0.9452
## [5,] 1 -0.5079 -0.5908 -1.1711 0.70743 -0.768965 -0.7351
## [6,] 1 -0.3319 -0.4095 -1.3498 1.41872 -0.597174 -0.5251
## [7,] 1 -0.2485 -0.2245 -1.4161 1.35118 -0.334958 -0.3151
## [8,] 1 -0.1558 -0.2474 0.4117 1.06810 -0.173229 -0.1050
## [9,] 1 -0.0446 0.0983 -0.3096 0.63414 -0.005175 0.1050
## [10,] 1 0.2705 0.3167 -0.3974 0.35969 0.188324 0.3151
## [11,] 1 0.6226 0.5541 -0.2754 0.27491 0.434295 0.5251
## [12,] 1 0.8450 0.5719 1.5920 0.04356 0.650652 0.7351
## [13,] 1 1.0118 0.9558 0.6631 -0.07858 0.854214 0.9452
## [14,] 1 1.1601 1.1560 0.7894 -0.13319 1.142019 1.1552
## [15,] 1 1.2990 1.3127 1.7258 -0.04984 1.499116 1.3653
## [16,] 1 1.4102 1.6821 0.8708 0.31658 1.819554 1.5753
# condition number is much smaller
Xs.svd <- svd(Xs)
Xs.svd$d
## [1] 8.30967 4.19882 4.00000 1.74682 0.47321 0.19566 0.07517
Xs.cond <- max(Xs.svd$d) / min(Xs.svd$d)
Xs.cond
## [1] 110.5
# fit linear regression
beta.ys <- lm(all[, 1] ~ Xs - 1)$coefficients
beta.ys
## Xsintercept XsGNPdef XsGNP Xsunemployed Xsarmed
## 65317.0 162.5 -3560.2 -1887.8 -719.0
## Xspop Xsyear
## -355.5 8708.5
# fit linear regression with perturbed y
beta.yes <- lm(all[, 1] + ye ~ Xs - 1)$coefficients
beta.yes
## Xsintercept XsGNPdef XsGNP Xsunemployed Xsarmed
## 65636.3 -1412.7 -14256.4 -3846.6 -1345.5
## Xspop Xsyear
## -542.7 22783.0
# relative error in y
sqrt(crossprod(ye) / crossprod(all[, 1]))
## [,1]
## [1,] 0.01314
# relative error in regression coefficients
sqrt(crossprod(beta.ys - beta.yes) / crossprod(beta.ys))
## [,1]
## [1,] 0.2707
Some further experiments.
# form the Grammian matrix
xtx <- crossprod(X)
# qr() of the original X shows it has rull column rank
qr(X)$rank
## [1] 7
# but qr() thinks the Grammian matrix is rank deficient
qr(xtx)$rank
## [1] 5
# chol() (without symmetric pivoting) considers Grammian matrix as full rank
# and goes through without error
chol(xtx)
## intercept GNPdef GNP unemployed armed pop year
## intercept 4 406.7 1550794 12773.2 10426.8 469696.00 7818.0000
## GNPdef 0 41.8 381717 2246.2 1252.6 26379.51 18.2759
## GNP 0 0.0 49823 -311.9 -299.8 4196.92 1.7753
## unemployed 0 0.0 0 2820.6 -1644.3 3189.74 1.4530
## armed 0 0.0 0 0.0 1703.5 -46.25 0.4492
## pop 0 0.0 0 0.0 0.0 1463.20 -0.2819
## year 0 0.0 0 0.0 0.0 0.00 0.6693
# solve() thinks the Grammian matrix is singular
print(try(solve(xtx), silent = TRUE))
## [1] "Error in solve.default(xtx) : \n system is computationally singular: reciprocal condition number = 3.50566e-20\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in solve.default(xtx): system is computationally singular: reciprocal condition number = 3.50566e-20>
# clean and close
rm(list = ls())