Plotting a multicolumn regression problem that includes missingnessΒΆ

An example plotting a simultaneous fit of the sine and cosine functions. There are two redundant predictors, each of which has independent and random missingness.

../_images/sphx_glr_plot_multicolumn_001.png

Out:

  Beginning forward pass
---------------------------------------------------------------------
iter  parent  var  knot  mse          terms  gcv       rsq    grsq
---------------------------------------------------------------------
0     -       -    -     4893.487156  1      4894.466  0.000  0.000
1     0       6    2137  2847.258070  5      2853.532  0.418  0.417
2     3       6    1882  1604.248646  7      1609.395  0.672  0.671
3     4       6    3981  1091.498983  9      1096.098  0.777  0.776
4     0       5    3872  865.753079  13     871.146  0.823  0.822
5     1       5    6251  677.545451  17     683.136  0.862  0.860
6     11      5    7295  529.897093  19     534.806  0.892  0.891
7     3       5    1687  429.659153  23     434.512  0.912  0.911
8     12      5    8217  356.980379  25     361.376  0.927  0.926
9     12      6    6100  311.466783  29     315.937  0.936  0.935
10    23      5    7367  305.774544  31     310.476  0.938  0.937
11    5       6    3688  302.715204  33     307.679  0.938  0.937
-------------------------------------------------------------------
Stopping Condition 2: Improvement below threshold
Beginning pruning pass
------------------------------------------------
iter  bf  terms  mse     gcv      rsq    grsq
------------------------------------------------
0     -   33     302.73  307.691  0.938  0.937
1     2   32     302.73  307.536  0.938  0.937
2     12  31     302.73  307.381  0.938  0.937
3     13  30     302.73  307.226  0.938  0.937
4     3   29     302.71  307.058  0.938  0.937
5     10  28     302.73  306.917  0.938  0.937
6     16  27     302.83  306.872  0.938  0.937
7     30  26     303.00  306.881  0.938  0.937
8     32  25     303.26  306.990  0.938  0.937
9     15  24     303.80  307.382  0.938  0.937
10    25  23     305.32  308.772  0.938  0.937
11    1   22     306.62  309.932  0.937  0.937
12    31  21     309.57  312.748  0.937  0.936
13    29  20     314.07  317.136  0.936  0.935
14    19  19     322.35  325.336  0.934  0.934
15    22  18     337.54  340.500  0.931  0.930
16    6   17     347.58  350.448  0.929  0.928
17    28  16     361.15  363.944  0.926  0.926
18    24  15     371.25  373.935  0.924  0.924
19    27  14     439.31  442.265  0.910  0.910
20    7   13     465.62  468.521  0.905  0.904
21    21  12     570.82  574.085  0.883  0.883
22    5   11     613.94  617.143  0.875  0.874
23    18  10     766.50  770.116  0.843  0.843
24    8   9      941.18  945.144  0.808  0.807
25    14  8      1153.75  1158.031  0.764  0.763
26    9   7      1376.30  1380.712  0.719  0.718
27    17  6      1780.93  1785.752  0.636  0.635
28    20  5      2192.81  2197.638  0.552  0.551
29    26  4      2561.68  2566.044  0.477  0.476
30    11  3      3302.02  3305.984  0.325  0.325
31    4   2      3745.34  3747.962  0.235  0.234
32    23  1      4893.49  4894.466  0.000  0.000
--------------------------------------------------
Selected iteration: 0
Forward Pass
---------------------------------------------------------------------
iter  parent  var  knot  mse          terms  gcv       rsq    grsq
---------------------------------------------------------------------
0     -       -    -     4893.487156  1      4894.466  0.000  0.000
1     0       6    2137  2847.258070  5      2853.532  0.418  0.417
2     3       6    1882  1604.248646  7      1609.395  0.672  0.671
3     4       6    3981  1091.498983  9      1096.098  0.777  0.776
4     0       5    3872  865.753079   13     871.146   0.823  0.822
5     1       5    6251  677.545451   17     683.136   0.862  0.860
6     11      5    7295  529.897093   19     534.806   0.892  0.891
7     3       5    1687  429.659153   23     434.512   0.912  0.911
8     12      5    8217  356.980379   25     361.376   0.927  0.926
9     12      6    6100  311.466783   29     315.937   0.936  0.935
10    23      5    7367  305.774544   31     310.476   0.938  0.937
11    5       6    3688  302.715204   33     307.679   0.938  0.937
---------------------------------------------------------------------
Stopping Condition 2: Improvement below threshold

Pruning Pass
--------------------------------------------------
iter  bf  terms  mse      gcv       rsq    grsq
--------------------------------------------------
0     -   33     302.73   307.691   0.938  0.937
1     2   32     302.73   307.536   0.938  0.937
2     12  31     302.73   307.381   0.938  0.937
3     13  30     302.73   307.226   0.938  0.937
4     3   29     302.71   307.058   0.938  0.937
5     10  28     302.73   306.917   0.938  0.937
6     16  27     302.83   306.872   0.938  0.937
7     30  26     303.00   306.881   0.938  0.937
8     32  25     303.26   306.990   0.938  0.937
9     15  24     303.80   307.382   0.938  0.937
10    25  23     305.32   308.772   0.938  0.937
11    1   22     306.62   309.932   0.937  0.937
12    31  21     309.57   312.748   0.937  0.936
13    29  20     314.07   317.136   0.936  0.935
14    19  19     322.35   325.336   0.934  0.934
15    22  18     337.54   340.500   0.931  0.930
16    6   17     347.58   350.448   0.929  0.928
17    28  16     361.15   363.944   0.926  0.926
18    24  15     371.25   373.935   0.924  0.924
19    27  14     439.31   442.265   0.910  0.910
20    7   13     465.62   468.521   0.905  0.904
21    21  12     570.82   574.085   0.883  0.883
22    5   11     613.94   617.143   0.875  0.874
23    18  10     766.50   770.116   0.843  0.843
24    8   9      941.18   945.144   0.808  0.807
25    14  8      1153.75  1158.031  0.764  0.763
26    9   7      1376.30  1380.712  0.719  0.718
27    17  6      1780.93  1785.752  0.636  0.635
28    20  5      2192.81  2197.638  0.552  0.551
29    26  4      2561.68  2566.044  0.477  0.476
30    11  3      3302.02  3305.984  0.325  0.325
31    4   2      3745.34  3747.962  0.235  0.234
32    23  1      4893.49  4894.466  0.000  0.000
--------------------------------------------------
Selected iteration: 0

Earth Model
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Basis Function                                                                                                           Pruned  Coefficient 0  Coefficient 1
---------------------------------------------------------------------------------------------------------------------------------------------------------------
(Intercept)                                                                                                              No      -5.32206e+09   1.19712e+10
present(x6)                                                                                                              No      1.93529e+09    -4.35316e+09
missing(x6)                                                                                                              No      2.90294e+09    -6.52973e+09
C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)                                                                        No      3.39758        -5.46026
C(x6|s=-1,-13.0806,-0.196727,11.4609)*present(x6)                                                                        No      -1.66364       -10.5413
C(x6|s=+1,11.4609,23.1185,23.8585)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)                                     No      -0.425959      0.166173
C(x6|s=-1,11.4609,23.1185,23.8585)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)                                     No      0.388521       -0.0691114
C(x6|s=+1,-32.9814,-25.9644,-13.0806)*C(x6|s=-1,-13.0806,-0.196727,11.4609)*present(x6)                                  No      -0.481603      -0.0625639
C(x6|s=-1,-32.9814,-25.9644,-13.0806)*C(x6|s=-1,-13.0806,-0.196727,11.4609)*present(x6)                                  No      0.26819        0.319225
present(x5)                                                                                                              No      2.41912e+09    -5.44144e+09
missing(x5)                                                                                                              No      2.41912e+09    -5.44144e+09
C(x5|s=+1,-9.38452,1.36818,14.8996)*present(x5)                                                                          No      0.121137       -10.2913
C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)                                                                          No      -2.91523       -0.483027
present(x5)*present(x6)                                                                                                  No      9.67647e+08    -2.17658e+09
missing(x5)*present(x6)                                                                                                  No      9.67647e+08    -2.17658e+09
C(x5|s=+1,-19.348,1.389,13.869)*present(x5)*present(x6)                                                                  No      -5.21392       6.62229
C(x5|s=-1,-19.348,1.389,13.869)*present(x5)*present(x6)                                                                  No      9.22985        -6.41773
C(x5|s=+1,14.8996,28.4311,34.2832)*C(x5|s=+1,-9.38452,1.36818,14.8996)*present(x5)                                       No      -0.229284      0.365081
C(x5|s=-1,14.8996,28.4311,34.2832)*C(x5|s=+1,-9.38452,1.36818,14.8996)*present(x5)                                       No      0.444821       -0.129529
present(x5)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)                                                            No      3.81312        -0.595014
missing(x5)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)                                                            No      -0.416023      -4.86417
C(x5|s=+1,13.869,26.349,33.2422)*present(x5)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)                           No      0.247209       -0.335965
C(x5|s=-1,13.869,26.349,33.2422)*present(x5)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)                           No      -0.42469       0.0927334
C(x5|s=+1,-35.584,-31.0829,-25.61)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)                                       No      -0.417321      -0.122234
C(x5|s=-1,-35.584,-31.0829,-25.61)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)                                       No      0.199904       0.309103
present(x6)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)                                                              No      -5.554         7.50276
missing(x6)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)                                                              No      2.63895        -7.98618
C(x6|s=+1,-35.0155,-30.0326,4.9814)*present(x6)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)                          No      0.434228       0.0633731
C(x6|s=-1,-35.0155,-30.0326,4.9814)*present(x6)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)                          No      -0.266897      -0.316667
C(x5|s=+1,-25.61,-20.1372,-9.38452)*C(x5|s=+1,-35.584,-31.0829,-25.61)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)   No      0.0011487      0.00313568
C(x5|s=-1,-25.61,-20.1372,-9.38452)*C(x5|s=+1,-35.584,-31.0829,-25.61)*C(x5|s=-1,-9.38452,1.36818,14.8996)*present(x5)   No      -0.00453544    -0.00781131
C(x6|s=+1,23.8585,24.5984,32.2969)*C(x6|s=+1,11.4609,23.1185,23.8585)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)  No      0.00825191     0.00633597
C(x6|s=-1,23.8585,24.5984,32.2969)*C(x6|s=+1,11.4609,23.1185,23.8585)*C(x6|s=+1,-13.0806,-0.196727,11.4609)*present(x6)  No      -0.0136051     -0.00110534
---------------------------------------------------------------------------------------------------------------------------------------------------------------
MSE: 296.2991, GCV: 301.1581, RSQ: 0.9395, GRSQ: 0.9385

import numpy
import matplotlib.pyplot as plt

from pyearth import Earth

# Create some fake data
numpy.random.seed(2)
m = 10000
n = 10
X = 80 * numpy.random.uniform(size=(m, n)) - 40
X[:, 5] = X[:, 6] + numpy.random.normal(0, .1, m)
y1 = 100 * \
    (numpy.sin((X[:, 5] + X[:, 6]) / 20) - 4.0) + \
    10 * numpy.random.normal(size=m)
y2 = 100 * \
    (numpy.cos((X[:, 5] + X[:, 6]) / 20) - 4.0) + \
    10 * numpy.random.normal(size=m)
y = numpy.concatenate([y1[:, None], y2[:, None]], axis=1)
missing = numpy.random.binomial(1, .2, (m, n)).astype(bool)
X_full = X.copy()
X[missing] = None
idx5 = (1 - missing[:, 5]).astype(bool)
idx6 = (1 - missing[:, 6]).astype(bool)

# Fit an Earth model
model = Earth(max_degree=5, minspan_alpha=.5, allow_missing=True,
              enable_pruning=True, thresh=.001, smooth=True,
              verbose=True)
model.fit(X, y)

# Print the model
print(model.trace())
print(model.summary())

# Plot the model
y_hat = model.predict(X)
fig = plt.figure()

for j in [0, 1]:
    ax1 = fig.add_subplot(3, 4, 1 + 2*j)
    ax1.plot(X_full[idx5, 5], y[idx5, j], 'b.')
    ax1.plot(X_full[idx5, 5], y_hat[idx5, j], 'r.')
    ax1.set_xlim(-40, 40)
    ax1.set_title('x5 present')
    ax1.set_xlabel('x5')
    ax1.set_ylabel('sin' if j == 0 else 'cos')

    ax2 = fig.add_subplot(3, 4, 2 + 2*j)
    ax2.plot(X_full[idx6, 6], y[idx6, j], 'b.')
    ax2.plot(X_full[idx6, 6], y_hat[idx6, j], 'r.')
    ax2.set_xlim(-40, 40)
    ax2.set_title('x6 present')
    ax2.set_xlabel('x6')
    ax2.set_ylabel('sin' if j == 0 else 'cos')

    ax3 = fig.add_subplot(3, 4, 5 + 2*j, sharex=ax1)
    ax3.plot(X_full[~idx6, 5], y[~idx6, j], 'b.')
    ax3.plot(X_full[~idx6, 5], y_hat[~idx6, j], 'r.')
    ax3.set_title('x6 missing')
    ax3.set_xlabel('x5')
    ax3.set_ylabel('sin' if j == 0 else 'cos')

    ax4 = fig.add_subplot(3, 4, 6 + 2*j, sharex=ax2)
    ax4.plot(X_full[~idx5, 6], y[~idx5, j], 'b.')
    ax4.plot(X_full[~idx5, 6], y_hat[~idx5, j], 'r.')
    ax4.set_title('x5 missing')
    ax4.set_xlabel('x6')
    ax4.set_ylabel('sin' if j == 0 else 'cos')

    ax5 = fig.add_subplot(3, 4, 9 + 2*j, sharex=ax1)
    ax5.plot(X_full[(~idx6) & (~idx5), 5], y[(~idx6) & (~idx5), j], 'b.')
    ax5.plot(X_full[(~idx6) & (~idx5), 5], y_hat[(~idx6) & (~idx5), j], 'r.')
    ax5.set_title('both missing')
    ax5.set_xlabel('x5')
    ax5.set_ylabel('sin' if j == 0 else 'cos')

    ax6 = fig.add_subplot(3, 4, 10 + 2*j, sharex=ax2)
    ax6.plot(X_full[(~idx6) & (~idx5), 6], y[(~idx6) & (~idx5), j], 'b.')
    ax6.plot(X_full[(~idx6) & (~idx5), 6], y_hat[(~idx6) & (~idx5), j], 'r.')
    ax6.set_title('both missing')
    ax6.set_xlabel('x6')
    ax6.set_ylabel('sin' if j == 0 else 'cos')

fig.tight_layout()
plt.show()

Total running time of the script: (0 minutes 59.852 seconds)

Download Python source code: plot_multicolumn.py
Download IPython notebook: plot_multicolumn.ipynb