# Getting Started with NNS: Clustering and Regression

library(NNS)
library(data.table)
require(knitr)
require(rgl)
require(meboot)

# Clustering and Regression

Below are some examples demonstrating unsupervised learning with NNS clustering and nonlinear regression using the resulting clusters. As always, for a more thorough description and definition, please view the References.

## NNS Partitioning NNS.part()

NNS.part is both a partitional and hierarchical clustering method. NNS iteratively partitions the joint distribution into partial moment quadrants, and then assigns a quadrant identification (1:4) at each partition.

NNS.part returns a data.table of observations along with their final quadrant identification. It also returns the regression points, which are the quadrant means used in NNS.reg.

x = seq(-5, 5, .05); y = x ^ 3

for(i in 1 : 4){NNS.part(x, y, order = i, Voronoi = TRUE, obs.req = 0)}
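The returned list can also be captured and inspected directly; a minimal sketch using the $dt and $regression.points elements shown in the output further below:

part = NNS.part(x, y, order = 2)

# observations with their final quadrant identifications
head(part$dt)

# quadrant means, i.e. the regression points used by NNS.reg
part$regression.points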

### X-only Partitioning

NNS.part offers partitioning based on $$x$$ values only, via NNS.part(x, y, type = "XONLY", ...), using the entire bandwidth in its regression point derivation, and shares the same limit condition as partitioning via both $$x$$ and $$y$$ values.

for(i in 1 : 4){NNS.part(x, y, order = i, type = "XONLY", Voronoi = TRUE)}
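Capturing a single XONLY call makes the resulting identifications easy to inspect; a minimal sketch:

xonly = NNS.part(x, y, order = 2, type = "XONLY")

# partition identifications under XONLY use only 1's and 2's
xonly$dt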

Note that the partition identifications are limited to 1's and 2's (left and right of the partition, respectively), rather than the 4 values produced when partitioning on both $$x$$ and $$y$$.

## $order
## [1] 4
##
## $dt
##   1: -5.00 -125.0000    q1111           q111
##   2: -4.95 -121.2874    q1111           q111
##   3: -4.90 -117.6490    q1111           q111
##   4: -4.85 -114.0841    q1111           q111
##   5: -4.80 -110.5920    q1111           q111
##  ---
## 197:  4.80  110.5920    q2222           q222
## 198:  4.85  114.0841    q2222           q222
## 199:  4.90  117.6490    q2222           q222
## 200:  4.95  121.2874    q2222           q222
## 201:  5.00  125.0000    q2222           q222
##
## $regression.points
##    quadrant          x           y
## 1:     q111 -4.4136250 -87.0661563
## 2:     q112 -3.1635313 -32.4322620
## 3:     q121 -1.9133437  -7.4753437
## 4:     q122 -0.6634375  -0.3238252
## 5:     q211  0.5866563   0.2366875
## 6:     q212  1.8366563   6.6437852
## 7:     q221  3.0862812  30.1590941
## 8:     q222  4.3613732  84.1050922

## Clusters Used in Regression

The right column of plots shows the corresponding regression (plus endpoints and central point) for the order of NNS partitioning.

for(i in 1 : 3){NNS.part(x, y, order = i, obs.req = 0, Voronoi = TRUE, type = "XONLY") ; NNS.reg(x, y, order = i, ncores = 1)}

# NNS Regression NNS.reg()

NNS.reg can fit any $$f(x)$$, for both uni- and multivariate cases. NNS.reg returns a self-evident list of values provided below.

## Univariate:

NNS.reg(x, y, ncores = 1)

## $R2
## [1] 0.9999996
##
## $SE
## [1] 0.04287529
##
## $Prediction.Accuracy
## NULL
##
## $equation
## NULL
##
## $x.star
## NULL
##
## $derivative
##      Coefficient X.Lower.Range X.Upper.Range
##  1: 74.252500000   -5.00000000   -4.95000000
##  2: 71.302500000   -4.95000000   -4.80000000
##  3: 66.982500000   -4.80000000   -4.65000000
##  4: 62.797500000   -4.65000000   -4.50000000
##  5: 58.747500000   -4.50000000   -4.35000000
##  6: 54.832500000   -4.35000000   -4.20000000
##  7: 51.052500000   -4.20000000   -4.05000000
##  8: 47.103363520   -4.05000000   -3.88757324
##  9: 43.323228231   -3.88757324   -3.70000000
## 10: 39.427500000   -3.70000000   -3.55000000
## 11: 36.232500000   -3.55000000   -3.40000000
## 12: 33.172500000   -3.40000000   -3.25000000
## 13: 30.247500000   -3.25000000   -3.10000000
## 14: 27.457500000   -3.10000000   -2.95000000
## 15: 24.802500000   -2.95000000   -2.80000000
## 16: 22.075993537   -2.80000000   -2.63757324
## 17: 19.513140049   -2.63757324   -2.45000000
## 18: 16.927500000   -2.45000000   -2.30000000
## 19: 14.857500000   -2.30000000   -2.15000000
## 20: 12.922500000   -2.15000000   -2.00000000
## 21: 11.122500000   -2.00000000   -1.85000000
## 22:  9.457500000   -1.85000000   -1.70000000
## 23:  7.927500000   -1.70000000   -1.55000000
## 24:  6.423670900   -1.55000000   -1.38757324
## 25:  5.078010868   -1.38757324   -1.20000000
## 26:  3.802500000   -1.20000000   -1.05000000
## 27:  2.857500000   -1.05000000   -0.90000000
## 28:  2.047500000   -0.90000000   -0.75000000
## 29:  1.372500000   -0.75000000   -0.60000000
## 30:  0.832500000   -0.60000000   -0.45000000
## 31:  0.427500000   -0.45000000   -0.30000000
## 32:  0.145042838   -0.30000000   -0.13757324
## 33:  0.034017962   -0.13757324   -0.04378662
## 34:  0.004006248   -0.04378662    0.05000000
## 35:  0.052500000    0.05000000    0.20000000
## 36:  0.232500000    0.20000000    0.35000000
## 37:  0.547500000    0.35000000    0.50000000
## 38:  0.997500000    0.50000000    0.65000000
## 39:  1.582500000    0.65000000    0.80000000
## 40:  2.302500000    0.80000000    0.95000000
## 41:  3.240170224    0.95000000    1.11242676
## 42:  4.336091045    1.11242676    1.30000000
## 43:  5.677500000    1.30000000    1.45000000
## 44:  6.982500000    1.45000000    1.60000000
## 45:  8.422500000    1.60000000    1.75000000
## 46:  9.997500000    1.75000000    1.90000000
## 47: 11.707500000    1.90000000    2.05000000
## 48: 13.552500000    2.05000000    2.20000000
## 49: 15.708643845    2.20000000    2.36242676
## 50: 18.029602043    2.36242676    2.55000000
## 51: 20.677500000    2.55000000    2.70000000
## 52: 23.107500000    2.70000000    2.85000000
## 53: 25.672500000    2.85000000    3.00000000
## 54: 28.372500000    3.00000000    3.15000000
## 55: 31.207500000    3.15000000    3.30000000
## 56: 34.177500000    3.30000000    3.45000000
## 57: 37.563906884    3.45000000    3.61242676
## 58: 41.087904139    3.61242676    3.80000000
## 59: 45.052500000    3.80000000    3.95000000
## 60: 48.607500000    3.95000000    4.10000000
## 61: 52.622363971    4.10000000    4.26242676
## 62: 56.792988741    4.26242676    4.45000000
## 63: 61.432500000    4.45000000    4.60000000
## 64: 65.572500000    4.60000000    4.75000000
## 65: 70.213534871    4.75000000    4.91242676
## 66: 73.350809172    4.91242676    5.00000000
##      Coefficient X.Lower.Range X.Upper.Range
##
## $Point.est
## NULL
##
## $pred.int
## NULL
##
## $regression.points
##               x             y
##  1: -5.00000000 -1.250000e+02
##  2: -4.95000000 -1.212874e+02
##  3: -4.80000000 -1.105920e+02
##  4: -4.65000000 -1.005446e+02
##  5: -4.50000000 -9.112500e+01
##  6: -4.35000000 -8.231287e+01
##  7: -4.20000000 -7.408800e+01
##  8: -4.05000000 -6.643012e+01
##  9: -3.88757324 -5.877928e+01
## 10: -3.70000000 -5.065300e+01
## 11: -3.55000000 -4.473887e+01
## 12: -3.40000000 -3.930400e+01
## 13: -3.25000000 -3.432812e+01
## 14: -3.10000000 -2.979100e+01
## 15: -2.95000000 -2.567237e+01
## 16: -2.80000000 -2.195200e+01
## 17: -2.63757324 -1.836627e+01
## 18: -2.45000000 -1.470612e+01
## 19: -2.30000000 -1.216700e+01
## 20: -2.15000000 -9.938375e+00
## 21: -2.00000000 -8.000000e+00
## 22: -1.85000000 -6.331625e+00
## 23: -1.70000000 -4.913000e+00
## 24: -1.55000000 -3.723875e+00
## 25: -1.38757324 -2.680499e+00
## 26: -1.20000000 -1.728000e+00
## 27: -1.05000000 -1.157625e+00
## 28: -0.90000000 -7.290000e-01
## 29: -0.75000000 -4.218750e-01
## 30: -0.60000000 -2.160000e-01
## 31: -0.45000000 -9.112500e-02
## 32: -0.30000000 -2.700000e-02
## 33: -0.13757324 -3.441162e-03
## 34: -0.04378662 -2.507324e-04
## 35:  0.05000000  1.250000e-04
## 36:  0.20000000  8.000000e-03
## 37:  0.35000000  4.287500e-02
## 38:  0.50000000  1.250000e-01
## 39:  0.65000000  2.746250e-01
## 40:  0.80000000  5.120000e-01
## 41:  0.95000000  8.573750e-01
## 42:  1.11242676  1.383665e+00
## 43:  1.30000000  2.197000e+00
## 44:  1.45000000  3.048625e+00
## 45:  1.60000000  4.096000e+00
## 46:  1.75000000  5.359375e+00
## 47:  1.90000000  6.859000e+00
## 48:  2.05000000  8.615125e+00
## 49:  2.20000000  1.064800e+01
## 50:  2.36242676  1.319950e+01
## 51:  2.55000000  1.658138e+01
## 52:  2.70000000  1.968300e+01
## 53:  2.85000000  2.314913e+01
## 54:  3.00000000  2.700000e+01
## 55:  3.15000000  3.125588e+01
## 56:  3.30000000  3.593700e+01
## 57:  3.45000000  4.106363e+01
## 58:  3.61242676  4.716501e+01
## 59:  3.80000000  5.487200e+01
## 60:  3.95000000  6.162988e+01
## 61:  4.10000000  6.892100e+01
## 62:  4.26242676  7.746828e+01
## 63:  4.45000000  8.812113e+01
## 64:  4.60000000  9.733600e+01
## 65:  4.75000000  1.071719e+02
## 66:  4.91242676  1.185764e+02
## 67:  5.00000000  1.250000e+02
##               x             y
##
## $Fitted.xy
##          x         y     y.hat   NNS.ID gradient   residuals standard.errors
##   1: -5.00 -125.0000 -125.0000 q1111111 74.25250  0.00000000      0.00000000
##   2: -4.95 -121.2874 -121.2874 q1111111 71.30250  0.00000000      0.07312511
##   3: -4.90 -117.6490 -117.7223 q1111112 71.30250  0.07325000      0.07312511
##   4: -4.85 -114.0841 -114.1571 q1111121 71.30250  0.07300000      0.07312511
##   5: -4.80 -110.5920 -110.5920 q1111121 66.98250  0.00000000      0.07087511
##  ---
## 197:  4.80  110.5920  110.6826 q2222212 70.21353 -0.09055174      0.08778340
## 198:  4.85  114.0841  114.1932 q2222221 70.21353 -0.10910349      0.08778340
## 199:  4.90  117.6490  117.7039 q2222221 70.21353 -0.05490523      0.08778340
## 200:  4.95  121.2874  121.3325 q2222222 73.35081 -0.04508454      0.04508454
## 201:  5.00  125.0000  125.0000 q2222222 73.35081  0.00000000      0.04508454

## Multivariate:

Multivariate regressions return a plot of $$y$$ and $$\hat{y}$$, as well as the regression points ($RPM) and partitions ($rhs.partitions) for each regressor.

f = function(x, y) x ^ 3 + 3 * y - y ^ 3 - 3 * x
y = x ; z <- expand.grid(x, y)
g = f(z[ , 1], z[ , 2])

NNS.reg(z, g, order = "max", plot = FALSE, ncores = 1)

## $R2
## [1] 1
##
## $rhs.partitions
##         Var1 Var2
##     1: -5.00   -5
##     2: -4.95   -5
##     3: -4.90   -5
##     4: -4.85   -5
##     5: -4.80   -5
##    ---
## 40397:  4.80    5
## 40398:  4.85    5
## 40399:  4.90    5
## 40400:  4.95    5
## 40401:  5.00    5
##
## $RPM
##        Var1  Var2         y.hat
##     1: -4.8 -4.80 -7.105427e-15
##     2: -4.8 -2.55 -8.726063e+01
##     3: -4.8 -2.50 -8.806700e+01
##     4: -4.8 -2.45 -8.883587e+01
##     5: -4.8 -2.40 -8.956800e+01
##    ---
## 40397: -2.6 -2.80  3.776000e+00
## 40398: -2.6 -2.75  2.770875e+00
## 40399: -2.6 -2.70  1.807000e+00
## 40400: -2.6 -2.65  8.836250e-01
## 40401: -2.6 -2.60  1.776357e-15
##
## $Point.est
## NULL
##
## $pred.int
## NULL
##
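
Point estimates for new multivariate observations use the same point.est argument demonstrated later for iris; a minimal sketch with two arbitrary evaluation points (each row must supply a value for both regressors in z):

NNS.reg(z, g, order = "max", point.est = rbind(c(1, -2), c(0.5, 0.5)), plot = FALSE, ncores = 1)$Point.est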

## NNS Dimension Reduction Regression

NNS.reg also provides a dimension reduction regression via the dim.red.method parameter, as in NNS.reg(x, y, dim.red.method = "cor", ...). All regressors are reduced to a single dimension using the returned equation, accessible with NNS.reg(..., dim.red.method = "cor", ...)$equation.

NNS.reg(iris[ , 1 : 4], iris[ , 5], dim.red.method = "cor", location = "topleft", ncores = 1)$equation

##        Variable Coefficient
## 1: Sepal.Length   0.7980781
## 2:  Sepal.Width  -0.4402896
## 3: Petal.Length   0.9354305
## 4:  Petal.Width   0.9381792
## 5:  DENOMINATOR   4.0000000

Thus, our model for this regression would be:

$$Species = \frac{0.798*Sepal.Length - 0.44*Sepal.Width + 0.935*Petal.Length + 0.938*Petal.Width}{4}$$
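The single reduced dimension implied by this equation can also be computed by hand as a weighted average of the regressors. This is only a rough illustration of the displayed formula, using the coefficients from the $equation output above; NNS performs the reduction internally:

# hand-computed synthetic regressor from the displayed equation (illustration only)
w = c(0.7980781, -0.4402896, 0.9354305, 0.9381792)
synthetic.x = as.matrix(iris[ , 1 : 4]) %*% w / 4
head(synthetic.x)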

### Threshold

NNS.reg(x, y, dim.red.method = "cor", threshold = ...) offers a method of reducing regressors further by controlling the minimum absolute correlation a regressor must have in order to be retained. For example, with threshold = .75, Sepal.Width (absolute correlation of roughly 0.44) is dropped from the equation below.

NNS.reg(iris[ , 1 : 4], iris[ , 5], dim.red.method = "cor", threshold = .75, location = "topleft", ncores = 1)$equation

##        Variable Coefficient
## 1: Sepal.Length   0.7980781
## 2:  Sepal.Width   0.0000000
## 3: Petal.Length   0.9354305
## 4:  Petal.Width   0.9381792
## 5:  DENOMINATOR   3.0000000

Thus, our model for this further reduced dimension regression would be:

$$Species = \frac{0.798*Sepal.Length + 0*Sepal.Width + 0.935*Petal.Length + 0.938*Petal.Width}{3}$$

The point.est = (...) argument operates in the same manner as in the full regression above, again called with NNS.reg(...)$Point.est.

NNS.reg(iris[ , 1 : 4], iris[ , 5], dim.red.method = "cor", threshold = .75, point.est = iris[1 : 10, 1 : 4], location = "topleft", ncores = 1)$Point.est

## [1] 1 1 1 1 1 1 1 1 1 1

# Classification

For a classification problem, we simply set NNS.reg(x, y, type = "CLASS", ...).

NOTE: The base category of the response variable should be 1, not 0, for classification problems (a recoding sketch follows the output below).

NNS.reg(iris[ , 1 : 4], iris[ , 5], type = "CLASS", point.est = iris[1 : 10, 1 : 4], location = "topleft", ncores = 1)$Point.est

##  [1] 1 1 1 1 1 1 1 1 1 1
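If a response is instead coded with a base category of 0, a simple shift satisfies the note above; a minimal sketch with a hypothetical 0/1 outcome:

# hypothetical 0/1 response recoded so its base category is 1
y01 = sample(0 : 1, 100, replace = TRUE)
y.class = y01 + 1   # categories are now 1 and 2, suitable for type = "CLASS"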

# Cross-Validation NNS.stack()

The NNS.stack routine cross-validates, for a given objective function, the n.best parameter in the multivariate NNS.reg function as well as the threshold parameter in the dimension reduction NNS.reg version. NNS.stack can be used for classification:

NNS.stack(..., type = "CLASS", ...)

or continuous dependent variables:

NNS.stack(..., type = NULL, ...).
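For instance, a continuous-response call might look like the following sketch, using Petal.Width from iris as the dependent variable and a single fold to keep the run short (all other parameters left at their defaults):

NNS.stack(IVs.train = iris[ , 1 : 3],
          DV.train = iris[ , 4],
          type = NULL, folds = 1, ncores = 1)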

Any objective function obj.fn can be called using expression() with the terms predicted and actual, even from external packages such as Metrics.

NNS.stack(..., obj.fn = expression(Metrics::mape(actual, predicted)), objective = "min").
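Any other expression built from predicted and actual works the same way; for example, a plain RMSE objective requires no external package:

NNS.stack(..., obj.fn = expression( sqrt(mean((predicted - actual) ^ 2)) ), objective = "min")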

NNS.stack(IVs.train = iris[ , 1 : 4],
          DV.train = iris[ , 5],
          IVs.test = iris[1 : 10, 1 : 4],
          dim.red.method = "cor",
          obj.fn = expression( mean(round(predicted) == actual) ),
          objective = "max", type = "CLASS",
          folds = 1, ncores = 1)
Folds Remaining = 0
Current NNS.reg(... , threshold = 0.935 ) MAX Iterations Remaining = 2
Current NNS.reg(... , threshold = 0.795 ) MAX Iterations Remaining = 1
Current NNS.reg(... , threshold = 0.44 ) MAX Iterations Remaining = 0
Current NNS.reg(... , n.best = 1 ) MAX Iterations Remaining = 12
Current NNS.reg(... , n.best = 2 ) MAX Iterations Remaining = 11
Current NNS.reg(... , n.best = 3 ) MAX Iterations Remaining = 10
Current NNS.reg(... , n.best = 4 ) MAX Iterations Remaining = 9
$OBJfn.reg
[1] 1

$NNS.reg.n.best
[1] 4

$probability.threshold
[1] 0.43875

$OBJfn.dim.red
[1] 0.9666667

$NNS.dim.red.threshold
[1] 0.935

$reg
[1] 1 1 1 1 1 1 1 1 1 1

$reg.pred.int
NULL

$dim.red
[1] 1 1 1 1 1 1 1 1 1 1

$dim.red.pred.int
NULL

$stack
[1] 1 1 1 1 1 1 1 1 1 1

$pred.int
NULL

## Increasing Dimensions

Given that multicollinearity is not an issue for nonparametric regressions as it is for OLS, in the case of an ill-fitting univariate model a better option may be to increase the dimensionality of the regressor with a copy of itself and cross-validate the number of clusters n.best via NNS.stack(IVs.train = cbind(x, x), DV.train = y, method = 1, ...).

set.seed(123)
x = rnorm(100); y = rnorm(100)

nns.params = NNS.stack(IVs.train = cbind(x, x), DV.train = y, method = 1, ncores = 1)

NNS.reg(cbind(x, x), y, n.best = nns.params$NNS.reg.n.best,
        point.est = cbind(x, x),
        residual.plot = TRUE,
        ncores = 1, confidence.interval = .95)

# References

If the user is so motivated, detailed arguments and further examples are provided within the following: