AFC - Abacus Formula Compiler for Java

Common Sub-Expressions In Large Sheets

Typical Sheet

The following sheet, while grossly oversimplified, seems to be a very typical sheet for the insurance models used by insurance companies (a relative of mine is a mathematician at one of them). The typical things are the large table and its inhomogeneity (in practice, the tables are much larger than the one here). This is not a simple repeating section with a perfectly homogeneous structure.

A B C D E F G H I
1 Inputs
2 Insured amount $15,000.00
(InsuredAmount)
3 Age 21
(Age)
4
5 Outputs
6 Cost per year 480
=INDEX(G12:G48,B9)
(InsuranceCostPerYear)
7
8 Helpers
9 Matching row 15
=MATCH(true,C12:C48,0.0)
10
11 Lookup Table AgeUpTo Match AgeUpTo (Copy) InsuredAmountUpTo Percentage InsuranceCostPerYear
12 20 false
=AND(Age<=D12,InsuredAmount<=E12)
20
=B12
$5,000.00 3% $150.00
=E12*F12
13 false
=AND(Age<=D13,InsuredAmount<=E13)
20
=B12
$6,000.00 3% $180.00
=E13*F13
14 false
=AND(Age<=D14,InsuredAmount<=E14)
20
=B12
$8,000.00 3% $240.00
=E14*F14
15 false
=AND(Age<=D15,InsuredAmount<=E15)
20
=B12
$10,000.00 4% $400.00
=E15*F15
Here we switch the percentage.
16 false
=AND(Age<=D16,InsuredAmount<=E16)
20
=B12
$13,000.00 4% $520.00
=E16*F16
17 false
=AND(Age<=D17,InsuredAmount<=E17)
20
=B12
$16,000.00 4% $640.00
=E17*F17
18 false
=AND(Age<=D18,InsuredAmount<=E18)
20
=B12
$20,000.00 5% $1,000.00
=E18*F18
Again.
19 false
=AND(Age<=D19,InsuredAmount<=E19)
20
=B12
$25,000.00 5% $1,250.00
=E19*F19
20 false
=AND(Age<=D20,InsuredAmount<=E20)
20
=B12
$30,000.00 5% $1,500.00
=E20*F20
21 22 false
=AND(Age<=D21,InsuredAmount<=E21)
22
=B21
$5,000.00 3% $150.00
=E21*F21
The next age group is similar, but with different percentages.
22 false
=AND(Age<=D22,InsuredAmount<=E22)
22
=B21
$6,000.00 3% $180.00
=E22*F22
23 false
=AND(Age<=D23,InsuredAmount<=E23)
22
=B21
$8,000.00 3% $240.00
=E23*F23
24 false
=AND(Age<=D24,InsuredAmount<=E24)
22
=B21
$10,000.00 3% $300.00
=E24*F24
25 false
=AND(Age<=D25,InsuredAmount<=E25)
22
=B21
$13,000.00 3% $390.00
=E25*F25
26 true
=AND(Age<=D26,InsuredAmount<=E26)
22
=B21
$16,000.00 3% $480.00
=E26*F26
27 true
=AND(Age<=D27,InsuredAmount<=E27)
22
=B21
$20,000.00 4% $800.00
=E27*F27
28 true
=AND(Age<=D28,InsuredAmount<=E28)
22
=B21
$25,000.00 4% $1,000.00
=E28*F28
29 true
=AND(Age<=D29,InsuredAmount<=E29)
22
=B21
$30,000.00 4% $1,200.00
=E29*F29
30 25 false
=AND(Age<=D30,InsuredAmount<=E30)
25
=B30
$5,000.00 3% $150.00
=E30*F30
Similar again, different percentages again.
31 false
=AND(Age<=D31,InsuredAmount<=E31)
25
=B30
$6,000.00 3% $180.00
=E31*F31
32 false
=AND(Age<=D32,InsuredAmount<=E32)
25
=B30
$8,000.00 3% $240.00
=E32*F32
33 false
=AND(Age<=D33,InsuredAmount<=E33)
25
=B30
$10,000.00 3% $300.00
=E33*F33
34 false
=AND(Age<=D34,InsuredAmount<=E34)
25
=B30
$13,000.00 3% $390.00
=E34*F34
35 true
=AND(Age<=D35,InsuredAmount<=E35)
25
=B30
$16,000.00 3% $480.00
=E35*F35
36 true
=AND(Age<=D36,InsuredAmount<=E36)
25
=B30
$20,000.00 3% $600.00
=E36*F36
37 true
=AND(Age<=D37,InsuredAmount<=E37)
25
=B30
$25,000.00 3% $750.00
=E37*F37
38 true
=AND(Age<=D38,InsuredAmount<=E38)
25
=B30
$30,000.00 3% $900.00
=E38*F38
39 60 true
=AND(Age<=D39,InsuredAmount<=E39)
60
=B39
$30,000.00 3.1600%
=2.0%+ABS(Age-50.0)/25.0*1.0%
$474.00
=InsuredAmount*F39
Complete switch of structure here.
40 200 false
=AND(Age<=D40,InsuredAmount<=E40)
200
=B40
$5,000.00 3% $450.00
=InsuredAmount*F40
Percentages again, but no fixed amounts.
41 false
=AND(Age<=D41,InsuredAmount<=E41)
200
=B40
$6,000.00 3% $450.00
=InsuredAmount*F41
42 false
=AND(Age<=D42,InsuredAmount<=E42)
200
=B40
$8,000.00 3% $450.00
=InsuredAmount*F42
43 false
=AND(Age<=D43,InsuredAmount<=E43)
200
=B40
$10,000.00 4% $600.00
=InsuredAmount*F43
44 false
=AND(Age<=D44,InsuredAmount<=E44)
200
=B40
$13,000.00 4% $600.00
=InsuredAmount*F44
45 true
=AND(Age<=D45,InsuredAmount<=E45)
200
=B40
$16,000.00 4% $600.00
=InsuredAmount*F45
46 true
=AND(Age<=D46,InsuredAmount<=E46)
200
=B40
$20,000.00 5% $750.00
=InsuredAmount*F46
47 true
=AND(Age<=D47,InsuredAmount<=E47)
200
=B40
$25,000.00 5% $750.00
=InsuredAmount*F47
48 true
=AND(Age<=D48,InsuredAmount<=E48)
200
=B40
$30,000.00 5% $750.00
=InsuredAmount*F48

Currently, SEJ compiles this to something like this:

public static final class OutputsCurrent implements Outputs
{
  private static final double[] COST_TABLE = { 150, 180, 240 /* ... all of 37 elts */};
  private final Inputs i;

  public OutputsCurrent( Inputs _inputs )
  {
    this.i = _inputs;
  }

  private double getAge()
  {
    return this.i.getAge();
  }

  private double getInsuredAmount()
  {
    return this.i.getInsuredAmount();
  }

  public double getInsuranceCostPerYear()
  {
    int r;
    switch (r = ((int) getMatchingRow()) - 1) {

      case 27:
        return getInsuredAmount() * (0.02 + Math.abs( getAge() - 50 ) / 25 * 0.01);

      case 28:
        return getInsuredAmount() * 0.03;
      case 29:
        return getInsuredAmount() * 0.03;
      case 30:
        return getInsuredAmount() * 0.03;
      case 31:
        return getInsuredAmount() * 0.04;
      case 32:
        return getInsuredAmount() * 0.04;
      case 33:
        return getInsuredAmount() * 0.04;

        /* ... for all rows of this form */

      default:
        if (r >= 0 && r < COST_TABLE.length) {
          return COST_TABLE[ r ];
        }
        return 0;
    }
  }

  private final double getMatchingRow()
  {
    int r = 1;

    // This is what really happens:
    if (((getAge() > 20.0 || getInsuredAmount() > 5000.0) ? 1.0 : 0.0) != 0.0) {
      r++;

      // This is what could be made to happen with a bit of peephole optimization:
      if (getAge() > 20.0 || getInsuredAmount() > 6000.0) {
        r++;

        if (getAge() > 20.0 || getInsuredAmount() > 8000.0) {
          r++;

          // ... for all of 37 cases */

        }
        else {
          r = 0;
        }
      }
    }

    return r;
  }

}

There are two stupid things here (apart from the possible peephole optimization noted in the code):

  • The method getMatchingRow() is a nightmare of redundancy and code bloat.
  • So is the repetition of return getInsuredAmount() * x in getInsuranceCostPerYear().

Now, code bloat now only means more code, it also means

  • less efficient use of high-speed CPU caches,
  • more JIT overhead, and
  • no clearly identifiable, small, and often-used hotspots in the code for the JITer to optimize heavily.

So the question is, how do we get more efficient code?

Homogenized Sheet

Now, this table could be made homogeneous, of course. It involves a formula type column in the table, and then another INDEX to select the proper computation variant to use (only the start of the sheet is shown):

A B C D E F G H I J K
2 Insured amount $15,000.00
(InsuredAmount)
3 Age 62
(Age)
4
5 Outputs
6 Cost per year 600
=VLOOKUP(true,C9:K45,9.0,false)
(InsuranceCostPerYear)
7
8 Lookup Table AgeUpTo Match AgeUpTo (Copy) InsuredAmountUpTo Percentage Style Style 1 Style 2 Style 3 Insured amount
9 20 false
=AND(Age<=D9,InsuredAmount<=E9)
20
=B9
$5,000.00 3% 1 $150.00
=E9*F9
$372.00
=InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%)
$450.00
=InsuredAmount*F9
$150.00
=INDEX(H9:J9,G9)
10 false
=AND(Age<=D10,InsuredAmount<=E10)
20
=B9
$6,000.00 3% 1 $180.00
=E10*F10
$372.00
=InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%)
$450.00
=InsuredAmount*F10
$180.00
=INDEX(JD10:JF10,G10)
11 false
=AND(Age<=D11,InsuredAmount<=E11)
20
=B9
$8,000.00 3% 1 $240.00
=E11*F11
$372.00
=InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%)
$450.00
=InsuredAmount*F11
$240.00
=INDEX(JD11:JF11,G11)

Now we could simply declare a non-dynamic section over the table (a feature SEJ would have to learn), which would make VLOOKUP() iterate over a statically initialized section (from the constants found in the range) and return the final result for the first match.

This would perform well even if the static section were a sliding cursor across static double[] arrays (for very large tables). Only if we need more than one return value from the table would we run into trouble: the lookup methods only ever return a single value. So we would have to re-iterate the table for every needed value.

If the static section were held in an array of objects (rather than a sliding cursor), we could use an initial MATCH and then multiple calls to INDEX to work around this with decent performance (at the cost of slightly higher memory usage).

Normalized Sheet

The approach taken above is like relational normalization, really. Done more fully, we get:

A B C D E F G
1 Inputs
2 Insured amount $15,000.00
(InsuredAmount)
3 Age 62
(Age)
4
5 Outputs
6 Cost per year 600
=INDEX(B11:B13,B10)
(InsuranceCostPerYear)
7
8 Helpers
9 Matching row 34
=MATCH(true,C16:C52,0.0)
10 Computation style 3
=INDEX(G16:G52,B9)
11 Style 1 $640.00
=INDEX(E16:E52,B9)*INDEX(F16:F52,B9)
Amount rounded up; percentage from table
12 Style 2 $372.00
=InsuredAmount*(2.0%+ABS(Age-50.0)/25.0*1.0%)
Exact amount; percentage computed from age
13 Style 3 $600.00
=InsuredAmount*INDEX(F16:F52,B9)
Exact amount; percentage from table
14
15 Lookup Table AgeUpTo Match AgeUpTo (Copy) InsuredAmountUpTo Percentage Style
16 20 false
=AND(Age<=D16,InsuredAmount<=E16)
20
=B16
$5,000.00 3% 1
17 false
=AND(Age<=D17,InsuredAmount<=E17)
20
=B16
$6,000.00 3% 1
18 false
=AND(Age<=D18,InsuredAmount<=E18)
20
=B16
$8,000.00 3% 1

If the static section is held in an array of objects (rather than a sliding cursor), we would get decent performance from this. The sliding cursor would have to re-iterate for every INDEX() we have here.

Typical Sheet Again

All this is not how typical Excel users think, however. They will much sooner come up with the first version of the sheet than the others.

So the question is, can we compile the first version to efficient code? Maybe by looking at the sheet in R1C1-notation. Then we clearly see that there are a lot of identical formulas referencing neighbouring cells.

So we could try to compile all shared formulas (after some threshold) to single methods with sheet, row, column parameters. Such a method could then access neighbours through some sort of dispatch routine which routes a sheet, row, column index to the appropriate cell getter. If we analyze for every shared formula the set of possible neighbours, we might get fairly compact and efficient dispatch methods, too.

Here’s a sketch of what I mean by this:

public static final class OutputsDesired implements Outputs
{
  private static final double[] COST_TABLE = { 150, 180, 240 /* ... all of 37 elts */};
  private static final double[] PERCENT_TABLE = { .03, .03, .03 /* ... all of 37 elts */};
  private static final double[] AGE_UPTO = { 20, 20, 20 /* ... all of 37 elts */};
  private static final double[] INSURED_AMOUNT_UPTO = { 5000, 6000, 8000 };

  private final Inputs i;

  public OutputsDesired( Inputs _inputs )
  {
    this.i = _inputs;
  }

  private double getAge()
  {
    return this.i.getAge();
  }

  private double getInsuredAmount()
  {
    return this.i.getInsuredAmount();
  }

  public double getInsuranceCostPerYear()
  {
    int r;
    switch (r = ((int) getMatchingRow()) - 1) {

      case 27:
        return getInsuredAmount() * (0.02 + Math.abs( getAge() - 50 ) / 25 * 0.01);

      case 28:
      case 29:
      case 30:
      case 31:
      case 32:
      case 33:
        return getStyle3( 0, r + 1, 7 );

      default:
        if (r >= 0 && r < COST_TABLE.length) {
          return COST_TABLE[ r ];
        }
        return 0;

    }
  }

  private double getStyle3( int _s, int _r, int _c )
  {
    return getInsuredAmount() * PERCENT_TABLE[ _r - 13 ];
  }

  private final double getMatchingRow()
  {
    for (int r = 13; r < 50; r++) {
      if (getMatch( 1, r, 3 )) return r;
    }
    return 0;
  }

  /**
   * The boolean return here is again a peephole optimization.
   */
  private final boolean getMatch( int _s, int _r, int _c )
  {
    return (getAge() <= getAgeUpto( _s, _r, _c + 1 ) && getInsuredAmount() <= getInsuredAmountUpto( _s, _r, _c + 2 ));
  }

  /**
   * If AFC sees that all the cells that can be referenced relatively are constant, then there
   * is no switch, just a table.
   */
  private double getAgeUpto( int _s, int _r, int _c )
  {
    return AGE_UPTO[ _r - 13 ];
  }

  private double getInsuredAmountUpto( int _s, int _r, int _c )
  {
    return INSURED_AMOUNT_UPTO[ _r - 13 ];
  }

}