Condition Number of Longley Data

First let’s work on the Longley data in the original scale.

# clean up workspace
rm(list = ls())

# read the Longley data
all <- 
  read.table("http://hua-zhou.github.io/teaching/st758-2014fall/longley.dat",
             col.names = c("employed", "GNPdef", "GNP", "unemployed", "armed", 
                           "pop", "year"))
X <- cbind(1, as.matrix(all[, 2:7]))
colnames(X)[1] <- "intercept"
X

##       intercept GNPdef    GNP unemployed armed    pop year
##  [1,]         1   83.0 234289       2356  1590 107608 1947
##  [2,]         1   88.5 259426       2325  1456 108632 1948
##  [3,]         1   88.2 258054       3682  1616 109773 1949
##  [4,]         1   89.5 284599       3351  1650 110929 1950
##  [5,]         1   96.2 328975       2099  3099 112075 1951
##  [6,]         1   98.1 346999       1932  3594 113270 1952
##  [7,]         1   99.0 365385       1870  3547 115094 1953
##  [8,]         1  100.0 363112       3578  3350 116219 1954
##  [9,]         1  101.2 397469       2904  3048 117388 1955
## [10,]         1  104.6 419180       2822  2857 118734 1956
## [11,]         1  108.4 442769       2936  2798 120445 1957
## [12,]         1  110.8 444546       4681  2637 121950 1958
## [13,]         1  112.6 482704       3813  2552 123366 1959
## [14,]         1  114.2 502601       3931  2514 125368 1960
## [15,]         1  115.7 518173       4806  2572 127852 1961
## [16,]         1  116.9 554894       4007  2827 130081 1962

# we see the smallest singular value (aka trouble number) is very small
X.svd <- svd(X)
X.svd$d

## [1] 1.664e+06 8.390e+04 3.407e+03 1.583e+03 4.169e+01 3.648e+00 3.424e-04

# condition number of the design matrix
X.cond <- max(X.svd$d) / min(X.svd$d)
X.cond

## [1] 4.859e+09

# R function for obtaining condition number; exact = FALSE (default)
# is an approximation
kappa(X)

## [1] 5.618e+09

kappa(X, exact = TRUE)

## [1] 4.859e+09

# fit linear regression
beta.y <- lm(all[, 1] ~ X - 1)$coefficients
beta.y

##  Xintercept     XGNPdef        XGNP Xunemployed      Xarmed        Xpop 
##  -3.482e+06   1.506e+01  -3.582e-02  -2.020e+00  -1.033e+00  -5.110e-02 
##       Xyear 
##   1.829e+03

# fit linear regression with perturbed y
set.seed(2014)
ye <- 1.0e3 * rnorm(nrow(all))
beta.ye <- lm(all[, 1] + ye ~ X - 1)$coefficients
beta.ye

##  Xintercept     XGNPdef        XGNP Xunemployed      Xarmed        Xpop 
##  -9.191e+06  -1.309e+02  -1.434e-01  -4.116e+00  -1.933e+00  -7.801e-02 
##       Xyear 
##   4.785e+03

# relative error in y
sqrt(crossprod(ye) / crossprod(all[, 1]))

##         [,1]
## [1,] 0.01314

# relative error in regression coefficients
sqrt(crossprod(beta.y - beta.ye) / crossprod(beta.y))

##       [,1]
## [1,] 1.639

What happens if we center and scale predictors?

# let's center and scale the predictors except intercept
Xs <- cbind(1, scale(X[, -1], center = TRUE, scale = TRUE))
colnames(Xs)[1] <- "intercept"
Xs

##       intercept  GNPdef     GNP unemployed    armed       pop    year
##  [1,]         1 -1.7311 -1.5434    -0.8960 -1.46093 -1.411135 -1.5753
##  [2,]         1 -1.2214 -1.2905    -0.9292 -1.65348 -1.263926 -1.3653
##  [3,]         1 -1.2492 -1.3043     0.5230 -1.42357 -1.099898 -1.1552
##  [4,]         1 -1.1288 -1.0373     0.1687 -1.37471 -0.933713 -0.9452
##  [5,]         1 -0.5079 -0.5908    -1.1711  0.70743 -0.768965 -0.7351
##  [6,]         1 -0.3319 -0.4095    -1.3498  1.41872 -0.597174 -0.5251
##  [7,]         1 -0.2485 -0.2245    -1.4161  1.35118 -0.334958 -0.3151
##  [8,]         1 -0.1558 -0.2474     0.4117  1.06810 -0.173229 -0.1050
##  [9,]         1 -0.0446  0.0983    -0.3096  0.63414 -0.005175  0.1050
## [10,]         1  0.2705  0.3167    -0.3974  0.35969  0.188324  0.3151
## [11,]         1  0.6226  0.5541    -0.2754  0.27491  0.434295  0.5251
## [12,]         1  0.8450  0.5719     1.5920  0.04356  0.650652  0.7351
## [13,]         1  1.0118  0.9558     0.6631 -0.07858  0.854214  0.9452
## [14,]         1  1.1601  1.1560     0.7894 -0.13319  1.142019  1.1552
## [15,]         1  1.2990  1.3127     1.7258 -0.04984  1.499116  1.3653
## [16,]         1  1.4102  1.6821     0.8708  0.31658  1.819554  1.5753

# condition number is much smaller
Xs.svd <- svd(Xs)
Xs.svd$d

## [1] 8.30967 4.19882 4.00000 1.74682 0.47321 0.19566 0.07517

Xs.cond <- max(Xs.svd$d) / min(Xs.svd$d)
Xs.cond

## [1] 110.5

# fit linear regression
beta.ys <- lm(all[, 1] ~ Xs - 1)$coefficients
beta.ys

##  Xsintercept     XsGNPdef        XsGNP Xsunemployed      Xsarmed 
##      65317.0        162.5      -3560.2      -1887.8       -719.0 
##        Xspop       Xsyear 
##       -355.5       8708.5

# fit linear regression with perturbed y
beta.yes <- lm(all[, 1] + ye ~ Xs - 1)$coefficients
beta.yes

##  Xsintercept     XsGNPdef        XsGNP Xsunemployed      Xsarmed 
##      65636.3      -1412.7     -14256.4      -3846.6      -1345.5 
##        Xspop       Xsyear 
##       -542.7      22783.0

# relative error in y
sqrt(crossprod(ye) / crossprod(all[, 1]))

##         [,1]
## [1,] 0.01314

# relative error in regression coefficients
sqrt(crossprod(beta.ys - beta.yes) / crossprod(beta.ys))

##        [,1]
## [1,] 0.2707

Some further experiments.

# form the Grammian matrix
xtx <- crossprod(X)
# qr() of the original X shows it has rull column rank
qr(X)$rank

## [1] 7

# but qr() thinks the Grammian matrix is rank deficient
qr(xtx)$rank

## [1] 5

# chol() (without symmetric pivoting) considers Grammian matrix as full rank
# and goes through without error
chol(xtx)

##            intercept GNPdef     GNP unemployed   armed       pop      year
## intercept          4  406.7 1550794    12773.2 10426.8 469696.00 7818.0000
## GNPdef             0   41.8  381717     2246.2  1252.6  26379.51   18.2759
## GNP                0    0.0   49823     -311.9  -299.8   4196.92    1.7753
## unemployed         0    0.0       0     2820.6 -1644.3   3189.74    1.4530
## armed              0    0.0       0        0.0  1703.5    -46.25    0.4492
## pop                0    0.0       0        0.0     0.0   1463.20   -0.2819
## year               0    0.0       0        0.0     0.0      0.00    0.6693

# solve() thinks the Grammian matrix is singular
print(try(solve(xtx), silent = TRUE))

## [1] "Error in solve.default(xtx) : \n  system is computationally singular: reciprocal condition number = 3.50566e-20\n"
## attr(,"class")
## [1] "try-error"
## attr(,"condition")
## <simpleError in solve.default(xtx): system is computationally singular: reciprocal condition number = 3.50566e-20>

# clean and close
rm(list = ls())

Condition Number of Longley Data

Hua Zhou

Sep 24th, 2014