chiark - git - mdw - mLib/blob - utils/linreg.h

   1 /* -*-c-*-
   2  *
   3  * Simple linear regression
   4  *
   5  * (c) 2023 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of the mLib utilities library.
  11  *
  12  * mLib is free software: you can redistribute it and/or modify it under
  13  * the terms of the GNU Library General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or (at
  15  * your option) any later version.
  16  *
  17  * mLib is distributed in the hope that it will be useful, but WITHOUT
  18  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  19  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
  20  * License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with mLib.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 #ifndef MLIB_LINREG_H
  29 #define MLIB_LINREG_H
  30
  31 #ifdef __cplusplus
  32   extern "C" {
  33 #endif
  34
  35 /* Theory.  (This should be well-known.)
  36  *
  37  * We have a number of data points (x_i, y_i) for 0 <= i < n.  We believe
  38  * they lie close to a straight line y = m x + c, for unknown constants m and
  39  * c.  Let e_i = m x_i + c - y_i.  We seek the parameters which minimize
  40  * E = ∑_{0\le i<n} e_i^2.
  41  *
  42  * We begin by simplifying
  43  *
  44  *      E = ∑ e_i^2
  45  *        = ∑ (m x_i + c - y_i)^2
  46  *        = ∑ (m^2 x_i^2 + c^2 + y_i^2 + 2 c m x_i - 2 m x_i y_i - 2 c y_i) .
  47  *
  48  * (where all sums are over 0 <= i < n).  Taking partial derivatives,
  49  *
  50  *      ∂E/∂m = ∑ (2 m x_i^2 + 2 c x_i - 2 x_i y_i)
  51  *            = 2 (m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i)
  52  *
  53  * and
  54  *
  55  *      ∂E/∂c = ∑ (2 c + 2 m x_i - 2 y_i)
  56  *            = 2 (n c + m ∑ x_i - ∑ y_i) .
  57  *
  58  * Now we solve for m and c such that the partial derivatives vanish.  Note
  59  * that the second partial derivatives are nonnegative, being 2 ∑ x_i^2 and 2
  60  * n respectively, so we shall indeed find a minimum.
  61  *
  62  * Restating the problem,
  63  *
  64  *      m ∑ x_i^2 + c ∑ x_i - ∑ x_i y_i = 0
  65  *      m ∑ x_i   + c n     - ∑ y_i     = 0 .
  66  *
  67  * Writing E[X] = (∑ x_i)/n, E[X Y] = (∑ x_i y_i)/n, etc., this gives
  68  *
  69  *      m n E[X^2] + c n E[X] - n E[X Y] = 0
  70  *      m n E[X]   + c n      - n E[Y]   = 0 .
  71  *
  72  * We see a common factor of n, so we can take that out.  Now multiply the
  73  * latter equation by E[X] and subtract, giving
  74  *
  75  *      m (E[X^2] - E[X]^2) - (E[X Y] - E[X] E[Y])
  76  *
  77  * whence
  78  *
  79  *      m = (E[X Y] - E[X] E[Y])/(E[X^2] - E[X]^2) .
  80  *
  81  * The numerator is known as the covariance of X and Y, written
  82  *
  83  *      cov(X, Y) = σ_{XY} = E[X Y] - E[X] E[Y] ;
  84  *
  85  * the denominator is the variance of X, written
  86  *
  87  *      var(X) = σ_X = E[X^2] - E[X]^2 .
  88  *
  89  * The second equation gives
  90  *
  91  *      c = E[Y] - m E[X] .
  92  *
  93  * Also of interest is the correlation coefficient
  94  *
  95  *      r = σ_{XY}/√(σ_X σ_Y) ,
  96  *
  97  * which I'm not going to derive here.
  98  */
  99
 100 /*----- Header files ------------------------------------------------------*/
 101
 102 /*----- Data structures ---------------------------------------------------*/
 103
 104 struct linreg {
 105   double sum_x, sum_x2, sum_y, sum_y2, sum_x_y;
 106   unsigned long n;
 107 };
 108 #define LINREG_INIT { 0.0, 0.0, 0.0, 0.0, 0.0, 0 }
 109
 110 /*----- Functions provided ------------------------------------------------*/
 111
 112 /* --- @linreg_init@ --- *
 113  *
 114  * Arguments:   @struct linreg *lr@ = linear regression state
 115  *
 116  * Returns:     ---
 117  *
 118  * Use:         Initializes a linear-regression state ready for use.
 119  */
 120
 121 extern void linreg_init(struct linreg */*lr*/);
 122
 123 /* --- @linreg_update@ --- *
 124  *
 125  * Arguments:   @struct linreg *lr@ = linear regression state
 126  *              @double x, y@ = point coordinates
 127  *
 128  * Returns:     ---
 129  *
 130  * Use:         Informs the linear regression machinery of a point.
 131  *
 132  *              Note that the state size is constant, and independent of the
 133  *              number of points.
 134  */
 135
 136 extern void linreg_update(struct linreg */*lr*/, double /*x*/, double /*y*/);
 137
 138 /* --- @linreg_fit@ --- *
 139  *
 140  * Arguments:   @const struct linreg *lr@ = linear regression state
 141  *              @double *m_out, *c_out, *r_out@ = where to write outputs
 142  *
 143  * Returns:     ---
 144  *
 145  * Use:         Compute the best-fit line through the previously-specified
 146  *              points.  The line has the equation %$y = m x + c$%; %$m$% and
 147  *              %$c$% are written to @*m_out@ and @*c_out@ respectively, and
 148  *              the correlation coefficient %$r$% is written to @*r_out@.
 149  *              Any (or all, but that would be useless) of the output
 150  *              pointers may be null to discard that result.
 151  *
 152  *              At least one point must have been given.
 153  */
 154
 155 extern void linreg_fit(const struct linreg */*lr*/,
 156                        double */*m_out*/, double */*c_out*/,
 157                        double */*r_out*/);
 158
 159 /*----- That's all, folks -------------------------------------------------*/
 160
 161 #ifdef __cplusplus
 162   }
 163 #endif
 164
 165 #endif